[llvm] eeac0ff - Revert "[MachineLICM] Use `RegisterClassInfo::getRegPressureSetLimit` (#119826)"
Nikita Popov via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 10 00:05:26 PST 2025
Author: Nikita Popov
Date: 2025-01-10T09:05:06+01:00
New Revision: eeac0ffaf46cf9f9b0f680b9940cc4b68a0286d8
URL: https://github.com/llvm/llvm-project/commit/eeac0ffaf46cf9f9b0f680b9940cc4b68a0286d8
DIFF: https://github.com/llvm/llvm-project/commit/eeac0ffaf46cf9f9b0f680b9940cc4b68a0286d8.diff
LOG: Revert "[MachineLICM] Use `RegisterClassInfo::getRegPressureSetLimit` (#119826)"
This reverts commit b4e17d4a314ed87ff6b40b4b05397d4b25b6636a.
This causes a large compile-time regression.
Added:
Modified:
llvm/lib/CodeGen/MachineLICM.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
llvm/test/CodeGen/AMDGPU/sdiv64.ll
llvm/test/CodeGen/AMDGPU/srem64.ll
llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
llvm/test/CodeGen/AMDGPU/udiv64.ll
llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
llvm/test/CodeGen/AMDGPU/urem64.ll
llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
llvm/test/CodeGen/LoongArch/jr-without-ra.ll
llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 798c3461094a8d..d1d5509dc482a2 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -124,7 +124,6 @@ namespace {
const TargetRegisterInfo *TRI = nullptr;
const MachineFrameInfo *MFI = nullptr;
MachineRegisterInfo *MRI = nullptr;
- RegisterClassInfo RegClassInfo;
TargetSchedModel SchedModel;
bool PreRegAlloc = false;
bool HasProfileData = false;
@@ -393,7 +392,6 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
MFI = &MF.getFrameInfo();
MRI = &MF.getRegInfo();
SchedModel.init(&ST);
- RegClassInfo.runOnMachineFunction(MF);
HasProfileData = MF.getFunction().hasProfileData();
@@ -410,7 +408,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
std::fill(RegPressure.begin(), RegPressure.end(), 0);
RegLimit.resize(NumRPS);
for (unsigned i = 0, e = NumRPS; i != e; ++i)
- RegLimit[i] = RegClassInfo.getRegPressureSetLimit(i);
+ RegLimit[i] = TRI->getRegPressureSetLimit(MF, i);
}
if (HoistConstLoads)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index bd2bbb97983122..23f24a9dc9982a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -325,13 +325,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -370,13 +370,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -394,13 +394,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -469,21 +469,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -513,20 +513,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -536,20 +536,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -602,15 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -640,15 +640,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -686,15 +686,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -712,15 +712,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -758,21 +758,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
+; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -795,22 +795,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
+; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -840,21 +840,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v5, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -864,21 +864,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1058,21 +1058,21 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB9_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1104,20 +1104,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1127,20 +1127,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB9_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1190,15 +1190,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1228,15 +1228,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1274,15 +1274,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1303,15 +1303,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: flat_load_dword v5, v[5:6]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -1344,21 +1344,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1381,22 +1381,22 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1427,21 +1427,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v5, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1451,24 +1451,24 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
-; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v5, v[5:6]
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1504,21 +1504,20 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
-; GFX940-NEXT: v_mov_b32_e32 v0, s16
-; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX940-NEXT: v_mov_b32_e32 v2, s16
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
+; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v3, v1, v1
; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_max_f32_e32 v0, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v1, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v1, v0
-; GFX940-NEXT: v_mov_b32_e32 v3, s16
+; GFX940-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v0, v3
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1554,20 +1553,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v2, s20
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s20
+; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v0, v3
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1581,24 +1579,23 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v2, s20
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
+; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v3, v1, v1
; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_max_f32_e32 v0, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v1, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v1, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s20
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT: v_max_f32_e32 v4, v0, v3
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB12_1
@@ -1609,24 +1606,23 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen
+; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1
; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s20
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT: v_max_f32_e32 v4, v0, v3
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
@@ -1664,24 +1660,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, s16
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GFX940-NEXT: v_mov_b32_e32 v2, s16
+; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v3, v0, v0
; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v1, v3, v3
-; GFX940-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX940-NEXT: v_max_f32_e32 v2, v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v6, s16
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX940-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_cbranch_execnz .LBB13_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1713,23 +1708,22 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v2, s20
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v1, v3, v3
-; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX90A-NEXT: v_max_f32_e32 v2, v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, s20
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1739,24 +1733,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v2, s20
+; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v3, v0, v0
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v3, v0, v0
-; GFX908-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s20
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1766,24 +1759,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v2, s20
+; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, s20
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1810,27 +1802,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen
+; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1]
-; GFX12-NEXT: v_mov_b32_e32 v10, s16
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1854,29 +1845,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v10, s16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1908,28 +1897,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
+; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v9, v1
-; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v10, s20
-; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
-; GFX908-NEXT: v_mov_b32_e32 v1, v7
-; GFX908-NEXT: v_mov_b32_e32 v2, v8
-; GFX908-NEXT: v_mov_b32_e32 v3, v9
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
@@ -1940,28 +1928,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v10, s20
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
@@ -1989,26 +1976,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, s16
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen
+; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen
; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5
-; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT: v_mov_b32_e32 v6, v2
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -2032,28 +2017,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s16
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
+; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen
; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v6, v2
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2085,27 +2067,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, s20
-; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
+; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
+; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
-; GFX908-NEXT: v_mov_b32_e32 v10, s20
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v9, v5
-; GFX908-NEXT: v_mov_b32_e32 v8, v4
-; GFX908-NEXT: v_mov_b32_e32 v7, v3
-; GFX908-NEXT: v_mov_b32_e32 v6, v2
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v10, v3
+; GFX908-NEXT: v_mov_b32_e32 v9, v2
+; GFX908-NEXT: v_mov_b32_e32 v8, v1
+; GFX908-NEXT: v_mov_b32_e32 v7, v0
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v6
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX908-NEXT: v_mov_b32_e32 v2, v7
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v5, v7
+; GFX908-NEXT: v_mov_b32_e32 v3, v8
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB15_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2115,27 +2096,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
+; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
+; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v10, s20
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v9, v5
-; GFX8-NEXT: v_mov_b32_e32 v8, v4
-; GFX8-NEXT: v_mov_b32_e32 v7, v3
-; GFX8-NEXT: v_mov_b32_e32 v6, v2
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v10, v3
+; GFX8-NEXT: v_mov_b32_e32 v9, v2
+; GFX8-NEXT: v_mov_b32_e32 v8, v1
+; GFX8-NEXT: v_mov_b32_e32 v7, v0
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v6
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, v7
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, v7
+; GFX8-NEXT: v_mov_b32_e32 v3, v8
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB15_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index b646b7f3010923..11024b0a88d6b7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -325,13 +325,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -370,13 +370,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -394,13 +394,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -469,21 +469,21 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -513,20 +513,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -536,20 +536,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -602,15 +602,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -640,15 +640,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -686,15 +686,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -712,15 +712,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -758,21 +758,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
+; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[8:9]
+; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -795,22 +795,22 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
+; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
+; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -840,21 +840,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v5, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -864,21 +864,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1058,21 +1058,21 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB9_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1104,20 +1104,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1127,20 +1127,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB9_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1190,15 +1190,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1228,15 +1228,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1274,15 +1274,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1303,15 +1303,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: flat_load_dword v5, v[5:6]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -1344,21 +1344,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[8:9]
+; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1381,22 +1381,22 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
+; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1427,21 +1427,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v5, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1451,24 +1451,24 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
-; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v5, v[5:6]
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1504,21 +1504,20 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
-; GFX940-NEXT: v_mov_b32_e32 v0, s16
-; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX940-NEXT: v_mov_b32_e32 v2, s16
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
+; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v3, v1, v1
; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_max_f32_e32 v0, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v1, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v1, v0
-; GFX940-NEXT: v_mov_b32_e32 v3, s16
+; GFX940-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX940-NEXT: v_min_f32_e32 v4, v0, v3
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1554,20 +1553,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v2, s20
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s20
+; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX90A-NEXT: v_min_f32_e32 v4, v0, v3
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1581,24 +1579,23 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v2, s20
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
+; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v3, v1, v1
; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_max_f32_e32 v0, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v1, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v1, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s20
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT: v_min_f32_e32 v4, v0, v3
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB12_1
@@ -1609,24 +1606,23 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen
+; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1
; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s20
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT: v_min_f32_e32 v4, v0, v3
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
@@ -1664,24 +1660,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, s16
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GFX940-NEXT: v_mov_b32_e32 v2, s16
+; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v3, v0, v0
; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v1, v3, v3
-; GFX940-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX940-NEXT: v_min_f32_e32 v2, v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v6, s16
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX940-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_cbranch_execnz .LBB13_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1713,23 +1708,22 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v2, s20
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v1, v3, v3
-; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX90A-NEXT: v_min_f32_e32 v2, v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, s20
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1739,24 +1733,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v2, s20
+; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v3, v0, v0
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v3, v0, v0
-; GFX908-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s20
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX908-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1766,24 +1759,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v2, s20
+; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, s20
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1810,27 +1802,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen
+; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
-; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1]
-; GFX12-NEXT: v_mov_b32_e32 v10, s16
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1854,29 +1845,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v10, s16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1908,28 +1897,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
+; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v9, v1
-; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v10, s20
-; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
-; GFX908-NEXT: v_mov_b32_e32 v1, v7
-; GFX908-NEXT: v_mov_b32_e32 v2, v8
-; GFX908-NEXT: v_mov_b32_e32 v3, v9
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
@@ -1940,28 +1928,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v10, s20
-; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
@@ -1989,26 +1976,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, s16
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen
+; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen
; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5
-; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT: v_mov_b32_e32 v6, v2
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -2032,28 +2017,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s16
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
+; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen
; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v6, v2
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2085,27 +2067,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, s20
-; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
+; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
+; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
-; GFX908-NEXT: v_mov_b32_e32 v10, s20
-; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v9, v5
-; GFX908-NEXT: v_mov_b32_e32 v8, v4
-; GFX908-NEXT: v_mov_b32_e32 v7, v3
-; GFX908-NEXT: v_mov_b32_e32 v6, v2
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v10, v3
+; GFX908-NEXT: v_mov_b32_e32 v9, v2
+; GFX908-NEXT: v_mov_b32_e32 v8, v1
+; GFX908-NEXT: v_mov_b32_e32 v7, v0
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v6
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX908-NEXT: v_mov_b32_e32 v2, v7
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v5, v7
+; GFX908-NEXT: v_mov_b32_e32 v3, v8
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB15_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2115,27 +2096,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
+; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
+; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v10, s20
-; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v9, v5
-; GFX8-NEXT: v_mov_b32_e32 v8, v4
-; GFX8-NEXT: v_mov_b32_e32 v7, v3
-; GFX8-NEXT: v_mov_b32_e32 v6, v2
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v10, v3
+; GFX8-NEXT: v_mov_b32_e32 v9, v2
+; GFX8-NEXT: v_mov_b32_e32 v8, v1
+; GFX8-NEXT: v_mov_b32_e32 v7, v0
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v6
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, v7
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, v7
+; GFX8-NEXT: v_mov_b32_e32 v3, v8
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB15_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 45a9f9d47acbaf..823db84a053b8c 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -557,11 +557,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_mul_hi_u32 s9, s0, s7
; GFX908-NEXT: s_mul_i32 s0, s0, s7
; GFX908-NEXT: s_add_i32 s1, s9, s1
-; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
+; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5
; GFX908-NEXT: s_branch .LBB3_2
; GFX908-NEXT: .LBB3_1: ; %Flow20
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15]
+; GFX908-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GFX908-NEXT: s_cbranch_vccz .LBB3_12
; GFX908-NEXT: .LBB3_2: ; %bb9
; GFX908-NEXT: ; =>This Loop Header: Depth=1
@@ -571,15 +571,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: ; %bb.3: ; %bb14
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
+; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
; GFX908-NEXT: s_mov_b32 s7, s6
+; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
; GFX908-NEXT: v_mov_b32_e32 v4, s6
+; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6
; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: v_mov_b32_e32 v9, s7
; GFX908-NEXT: v_mov_b32_e32 v5, s7
; GFX908-NEXT: v_mov_b32_e32 v7, s7
; GFX908-NEXT: v_mov_b32_e32 v8, s6
-; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0
-; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1
+; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
; GFX908-NEXT: v_mov_b32_e32 v11, v5
; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11]
; GFX908-NEXT: v_mov_b32_e32 v10, v4
@@ -599,9 +601,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX908-NEXT: s_add_u32 s18, s18, s0
+; GFX908-NEXT: s_add_u32 s18, s18, s14
; GFX908-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[2:3]
-; GFX908-NEXT: s_addc_u32 s19, s19, s1
+; GFX908-NEXT: s_addc_u32 s19, s19, s15
; GFX908-NEXT: s_mov_b64 s[20:21], 0
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23]
; GFX908-NEXT: s_cbranch_vccz .LBB3_9
@@ -620,7 +622,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: ds_read_b64 v[12:13], v19
; GFX908-NEXT: ds_read_b64 v[14:15], v0
-; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17]
+; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
; GFX908-NEXT: ; %bb.6: ; %bb51
@@ -648,7 +650,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_mov_b64 s[20:21], -1
; GFX908-NEXT: s_branch .LBB3_4
; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
-; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15]
+; GFX908-NEXT: s_mov_b64 s[20:21], s[16:17]
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21]
; GFX908-NEXT: s_cbranch_vccz .LBB3_4
; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
@@ -659,7 +661,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_xor_b64 s[16:17], s[20:21], -1
; GFX908-NEXT: .LBB3_10: ; %Flow19
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT: s_mov_b64 s[14:15], -1
+; GFX908-NEXT: s_mov_b64 s[0:1], -1
; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17]
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
; GFX908-NEXT: ; %bb.11: ; %bb12
@@ -668,7 +670,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_addc_u32 s5, s5, 0
; GFX908-NEXT: s_add_u32 s10, s10, s12
; GFX908-NEXT: s_addc_u32 s11, s11, s13
-; GFX908-NEXT: s_mov_b64 s[14:15], 0
+; GFX908-NEXT: s_mov_b64 s[0:1], 0
; GFX908-NEXT: s_branch .LBB3_1
; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock
; GFX908-NEXT: s_endpgm
@@ -718,11 +720,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s7
; GFX90A-NEXT: s_mul_i32 s0, s0, s7
; GFX90A-NEXT: s_add_i32 s1, s9, s1
-; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
+; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5
; GFX90A-NEXT: s_branch .LBB3_2
; GFX90A-NEXT: .LBB3_1: ; %Flow20
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15]
+; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
; GFX90A-NEXT: .LBB3_2: ; %bb9
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
@@ -732,12 +734,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: ; %bb.3: ; %bb14
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
; GFX90A-NEXT: s_mov_b32 s7, s6
+; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0
-; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1
+; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11]
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -756,8 +760,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: s_add_u32 s18, s18, s0
-; GFX90A-NEXT: s_addc_u32 s19, s19, s1
+; GFX90A-NEXT: s_add_u32 s18, s18, s14
+; GFX90A-NEXT: s_addc_u32 s19, s19, s15
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[4:5]
; GFX90A-NEXT: s_mov_b64 s[20:21], 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23]
@@ -777,7 +781,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ds_read_b64 v[14:15], v19
; GFX90A-NEXT: ds_read_b64 v[16:17], v0
-; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17]
+; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7
@@ -798,7 +802,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_mov_b64 s[20:21], -1
; GFX90A-NEXT: s_branch .LBB3_4
; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
-; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15]
+; GFX90A-NEXT: s_mov_b64 s[20:21], s[16:17]
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
@@ -809,7 +813,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_xor_b64 s[16:17], s[20:21], -1
; GFX90A-NEXT: .LBB3_10: ; %Flow19
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT: s_mov_b64 s[14:15], -1
+; GFX90A-NEXT: s_mov_b64 s[0:1], -1
; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
; GFX90A-NEXT: ; %bb.11: ; %bb12
@@ -818,7 +822,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_addc_u32 s5, s5, 0
; GFX90A-NEXT: s_add_u32 s10, s10, s12
; GFX90A-NEXT: s_addc_u32 s11, s11, s13
-; GFX90A-NEXT: s_mov_b64 s[14:15], 0
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_branch .LBB3_1
; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock
; GFX90A-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 48c7a9f5e2d14e..e8f1619c5d418c 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -54,23 +54,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB0_1
@@ -95,18 +95,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB0_1
@@ -122,18 +122,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB0_1
@@ -149,18 +149,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB0_1
@@ -176,19 +176,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB0_1
@@ -241,23 +241,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB1_1
@@ -290,19 +290,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB1_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -316,19 +316,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v4, v1
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
+; GFX7-NEXT: v_mov_b32_e32 v2, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB1_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -342,20 +342,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_add_f32_e32 v1, v2, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v1
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
+; GFX6-NEXT: v_mov_b32_e32 v2, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB1_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -827,23 +827,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB3_1
@@ -859,12 +859,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -885,18 +885,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB3_1
@@ -912,18 +912,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB3_1
@@ -939,18 +939,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB3_1
@@ -966,19 +966,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB3_1
@@ -1031,23 +1031,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB4_1
@@ -1062,11 +1062,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f32_e32 v2, v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1087,19 +1087,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v1
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v2, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB4_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1113,19 +1113,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB4_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1139,19 +1139,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v4, v1
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
+; GFX7-NEXT: v_mov_b32_e32 v2, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB4_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1165,20 +1165,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_add_f32_e32 v1, v2, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v1
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
+; GFX6-NEXT: v_mov_b32_e32 v2, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB4_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1218,24 +1218,25 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1249,23 +1250,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB5_1
@@ -1281,12 +1282,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1307,18 +1308,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
@@ -1334,18 +1335,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB5_1
@@ -1361,18 +1362,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB5_1
@@ -1388,19 +1389,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB5_1
@@ -1441,24 +1442,25 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1472,23 +1474,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB6_1
@@ -1504,12 +1506,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1530,18 +1532,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB6_1
@@ -1557,18 +1559,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB6_1
@@ -1584,18 +1586,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB6_1
@@ -1611,19 +1613,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB6_1
@@ -1664,24 +1666,25 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1695,23 +1698,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB7_1
@@ -1727,12 +1730,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1753,18 +1756,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
@@ -1780,18 +1783,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
@@ -1807,18 +1810,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB7_1
@@ -1834,19 +1837,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB7_1
@@ -1873,25 +1876,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v10, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1916,25 +1919,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x800
+; GFX11-NEXT: s_add_i32 s4, s16, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX11-NEXT: v_mov_b32_e32 v10, s5
-; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1949,26 +1952,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: s_add_i32 s5, s20, 0x800
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_add_i32 s4, s20, 0x800
+; GFX10-NEXT: v_mov_b32_e32 v6, s4
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_mov_b32_e32 v10, s5
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
-; GFX10-NEXT: v_mov_b32_e32 v1, v7
-; GFX10-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-NEXT: v_mov_b32_e32 v3, v9
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
+; GFX10-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB8_1
@@ -1994,21 +1997,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v9, v1
-; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v10, s6
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
-; GFX908-NEXT: v_mov_b32_e32 v1, v7
-; GFX908-NEXT: v_mov_b32_e32 v2, v8
-; GFX908-NEXT: v_mov_b32_e32 v3, v9
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB8_1
@@ -2025,21 +2028,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v10, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB8_1
@@ -2056,21 +2059,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX7-NEXT: s_add_i32 s6, s20, 0x800
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s6
; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v10, s6
-; GFX7-NEXT: v_mov_b32_e32 v0, v6
-; GFX7-NEXT: v_mov_b32_e32 v1, v7
-; GFX7-NEXT: v_mov_b32_e32 v2, v8
-; GFX7-NEXT: v_mov_b32_e32 v3, v9
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v10, v1
+; GFX7-NEXT: v_mov_b32_e32 v9, v0
+; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
+; GFX7-NEXT: v_mov_b32_e32 v1, v8
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
+; GFX7-NEXT: v_mov_b32_e32 v3, v10
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB8_1
@@ -2087,22 +2090,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX6-NEXT: s_add_i32 s6, s20, 0x800
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v6, s6
; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v9, v1
-; GFX6-NEXT: v_mov_b32_e32 v8, v0
-; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v10, s6
+; GFX6-NEXT: v_mov_b32_e32 v10, v1
+; GFX6-NEXT: v_mov_b32_e32 v9, v0
+; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v6
-; GFX6-NEXT: v_mov_b32_e32 v1, v7
-; GFX6-NEXT: v_mov_b32_e32 v2, v8
-; GFX6-NEXT: v_mov_b32_e32 v3, v9
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v7
+; GFX6-NEXT: v_mov_b32_e32 v1, v8
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
+; GFX6-NEXT: v_mov_b32_e32 v3, v10
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB8_1
@@ -2124,24 +2127,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5
-; GFX12-NEXT: v_mov_b32_e32 v8, v4
+; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2165,24 +2169,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x800
+; GFX11-NEXT: s_add_i32 s4, s16, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048
; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_mov_b32_e32 v8, v4
+; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2195,26 +2200,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x800
+; GFX10-NEXT: s_add_i32 s4, s20, 0x800
+; GFX10-NEXT: v_mov_b32_e32 v6, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v9, v5
-; GFX10-NEXT: v_mov_b32_e32 v10, s5
-; GFX10-NEXT: v_mov_b32_e32 v8, v4
+; GFX10-NEXT: v_mov_b32_e32 v10, v5
+; GFX10-NEXT: v_mov_b32_e32 v9, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v7, v3
-; GFX10-NEXT: v_mov_b32_e32 v6, v2
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc
+; GFX10-NEXT: v_mov_b32_e32 v8, v3
+; GFX10-NEXT: v_mov_b32_e32 v7, v2
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v5, v7
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v4, v7
+; GFX10-NEXT: v_mov_b32_e32 v5, v8
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB9_1
@@ -2238,22 +2243,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX908-NEXT: v_mov_b32_e32 v9, v5
-; GFX908-NEXT: v_mov_b32_e32 v10, s6
-; GFX908-NEXT: v_mov_b32_e32 v8, v4
-; GFX908-NEXT: v_mov_b32_e32 v7, v3
-; GFX908-NEXT: v_mov_b32_e32 v6, v2
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v5
+; GFX908-NEXT: v_mov_b32_e32 v9, v4
+; GFX908-NEXT: v_mov_b32_e32 v8, v3
+; GFX908-NEXT: v_mov_b32_e32 v7, v2
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v6
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v7
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v5, v7
+; GFX908-NEXT: v_mov_b32_e32 v5, v8
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB9_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2267,22 +2272,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v9, v5
-; GFX8-NEXT: v_mov_b32_e32 v10, s6
-; GFX8-NEXT: v_mov_b32_e32 v8, v4
-; GFX8-NEXT: v_mov_b32_e32 v7, v3
-; GFX8-NEXT: v_mov_b32_e32 v6, v2
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v5
+; GFX8-NEXT: v_mov_b32_e32 v9, v4
+; GFX8-NEXT: v_mov_b32_e32 v8, v3
+; GFX8-NEXT: v_mov_b32_e32 v7, v2
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v6
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v7
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, v7
+; GFX8-NEXT: v_mov_b32_e32 v5, v8
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB9_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2296,22 +2301,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
; GFX7-NEXT: s_add_i32 s6, s20, 0x800
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s6
; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX7-NEXT: v_mov_b32_e32 v9, v5
-; GFX7-NEXT: v_mov_b32_e32 v10, s6
-; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: v_mov_b32_e32 v7, v3
-; GFX7-NEXT: v_mov_b32_e32 v6, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v10, v5
+; GFX7-NEXT: v_mov_b32_e32 v9, v4
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v2
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v6
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB9_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2325,23 +2330,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
; GFX6-NEXT: s_add_i32 s6, s20, 0x800
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v6, s6
; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v9, v5
-; GFX6-NEXT: v_mov_b32_e32 v10, s6
-; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: v_mov_b32_e32 v7, v3
-; GFX6-NEXT: v_mov_b32_e32 v6, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v10, v5
+; GFX6-NEXT: v_mov_b32_e32 v9, v4
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v2
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v6
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v5, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB9_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2898,25 +2903,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v10, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2941,25 +2946,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x800
+; GFX11-NEXT: s_add_i32 s4, s16, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX11-NEXT: v_mov_b32_e32 v10, s5
-; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2974,26 +2979,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: s_add_i32 s5, s20, 0x800
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_add_i32 s4, s20, 0x800
+; GFX10-NEXT: v_mov_b32_e32 v6, s4
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_mov_b32_e32 v10, s5
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
-; GFX10-NEXT: v_mov_b32_e32 v1, v7
-; GFX10-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-NEXT: v_mov_b32_e32 v3, v9
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
+; GFX10-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
@@ -3010,18 +3015,18 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX90A-NEXT: s_add_i32 s6, s20, 0x800
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, s6
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v10, s6
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
@@ -3038,21 +3043,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v9, v1
-; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v10, s6
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
-; GFX908-NEXT: v_mov_b32_e32 v1, v7
-; GFX908-NEXT: v_mov_b32_e32 v2, v8
-; GFX908-NEXT: v_mov_b32_e32 v3, v9
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
@@ -3069,21 +3074,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v10, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
@@ -3100,21 +3105,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX7-NEXT: s_add_i32 s6, s20, 0x800
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s6
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v10, s6
-; GFX7-NEXT: v_mov_b32_e32 v0, v6
-; GFX7-NEXT: v_mov_b32_e32 v1, v7
-; GFX7-NEXT: v_mov_b32_e32 v2, v8
-; GFX7-NEXT: v_mov_b32_e32 v3, v9
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v10, v1
+; GFX7-NEXT: v_mov_b32_e32 v9, v0
+; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
+; GFX7-NEXT: v_mov_b32_e32 v1, v8
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
+; GFX7-NEXT: v_mov_b32_e32 v3, v10
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
@@ -3131,22 +3136,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX6-NEXT: s_add_i32 s6, s20, 0x800
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v6, s6
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v9, v1
-; GFX6-NEXT: v_mov_b32_e32 v8, v0
-; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v10, s6
+; GFX6-NEXT: v_mov_b32_e32 v10, v1
+; GFX6-NEXT: v_mov_b32_e32 v9, v0
+; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v6
-; GFX6-NEXT: v_mov_b32_e32 v1, v7
-; GFX6-NEXT: v_mov_b32_e32 v2, v8
-; GFX6-NEXT: v_mov_b32_e32 v3, v9
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v7
+; GFX6-NEXT: v_mov_b32_e32 v1, v8
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
+; GFX6-NEXT: v_mov_b32_e32 v3, v10
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
@@ -3169,25 +3174,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v10, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -3212,25 +3217,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x800
+; GFX11-NEXT: s_add_i32 s4, s16, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX11-NEXT: v_mov_b32_e32 v10, s5
-; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -3245,26 +3250,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: s_add_i32 s5, s20, 0x800
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_add_i32 s4, s20, 0x800
+; GFX10-NEXT: v_mov_b32_e32 v6, s4
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_mov_b32_e32 v10, s5
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
-; GFX10-NEXT: v_mov_b32_e32 v1, v7
-; GFX10-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-NEXT: v_mov_b32_e32 v3, v9
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
+; GFX10-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB12_1
@@ -3290,21 +3295,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v9, v1
-; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v10, s6
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
-; GFX908-NEXT: v_mov_b32_e32 v1, v7
-; GFX908-NEXT: v_mov_b32_e32 v2, v8
-; GFX908-NEXT: v_mov_b32_e32 v3, v9
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB12_1
@@ -3321,21 +3326,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v10, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
@@ -3352,21 +3357,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX7-NEXT: s_add_i32 s6, s20, 0x800
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s6
; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v10, s6
-; GFX7-NEXT: v_mov_b32_e32 v0, v6
-; GFX7-NEXT: v_mov_b32_e32 v1, v7
-; GFX7-NEXT: v_mov_b32_e32 v2, v8
-; GFX7-NEXT: v_mov_b32_e32 v3, v9
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v10, v1
+; GFX7-NEXT: v_mov_b32_e32 v9, v0
+; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
+; GFX7-NEXT: v_mov_b32_e32 v1, v8
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
+; GFX7-NEXT: v_mov_b32_e32 v3, v10
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
@@ -3383,22 +3388,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX6-NEXT: s_add_i32 s6, s20, 0x800
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v6, s6
; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v9, v1
-; GFX6-NEXT: v_mov_b32_e32 v8, v0
-; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v10, s6
+; GFX6-NEXT: v_mov_b32_e32 v10, v1
+; GFX6-NEXT: v_mov_b32_e32 v9, v0
+; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v6
-; GFX6-NEXT: v_mov_b32_e32 v1, v7
-; GFX6-NEXT: v_mov_b32_e32 v2, v8
-; GFX6-NEXT: v_mov_b32_e32 v3, v9
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v7
+; GFX6-NEXT: v_mov_b32_e32 v1, v8
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
+; GFX6-NEXT: v_mov_b32_e32 v3, v10
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB12_1
@@ -3426,43 +3431,43 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: s_addk_co_i32 s16, 0x200
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_and_b32 s5, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_lshl_b32 s5, s5, 3
+; GFX12-NEXT: v_mov_b32_e32 v5, s4
+; GFX12-NEXT: s_and_b32 s4, s16, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_not_b32 s7, s6
-; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1
+; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v3
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -3470,25 +3475,24 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s16, 0x200
-; GFX940-NEXT: s_and_b32 s6, s16, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s6
+; GFX940-NEXT: s_and_b32 s4, s16, -4
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
; GFX940-NEXT: s_and_b32 s4, s16, 3
-; GFX940-NEXT: s_lshl_b32 s7, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX940-NEXT: s_not_b32 s8, s4
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3
-; GFX940-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s6
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3
+; GFX940-NEXT: v_add_f16_e32 v2, v2, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -3498,51 +3502,49 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX940-NEXT: s_cbranch_execnz .LBB13_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_and_b32 s5, s16, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s5, s5, 3
-; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_not_b32 s7, s6
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s6, 0
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: v_mov_b32_e32 v5, s4
+; GFX11-NEXT: s_and_b32 s4, s16, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f16_e32 v1, v1, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1
-; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB13_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v3
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3550,22 +3552,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: s_and_b32 s5, s20, 3
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: s_lshl_b32 s5, s5, 3
-; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX10-NEXT: s_not_b32 s7, s6
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
-; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: s_and_b32 s4, s20, 3
+; GFX10-NEXT: s_lshl_b32 s4, s4, 3
+; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX10-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen
+; GFX10-NEXT: s_not_b32 s6, s5
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX10-NEXT: v_mov_b32_e32 v4, v2
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
@@ -3574,36 +3575,35 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX10-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
-; GFX90A-NEXT: s_and_b32 s6, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
+; GFX90A-NEXT: s_and_b32 s4, s20, -4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s4
; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
-; GFX90A-NEXT: s_lshl_b32 s7, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX90A-NEXT: s_not_b32 s8, s4
+; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
+; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3
-; GFX90A-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3
+; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -3613,30 +3613,29 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
-; GFX908-NEXT: s_and_b32 s6, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX908-NEXT: s_and_b32 s4, s20, -4
+; GFX908-NEXT: v_mov_b32_e32 v5, s4
+; GFX908-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
-; GFX908-NEXT: s_lshl_b32 s7, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX908-NEXT: s_not_b32 s8, s4
+; GFX908-NEXT: s_lshl_b32 s6, s4, 3
+; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2
; GFX908-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1
; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -3648,31 +3647,30 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
-; GFX8-NEXT: s_and_b32 s6, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX8-NEXT: s_and_b32 s4, s20, -4
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
-; GFX8-NEXT: s_lshl_b32 s7, s4, 3
-; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX8-NEXT: s_not_b32 s8, s4
+; GFX8-NEXT: s_lshl_b32 s6, s4, 3
+; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2
; GFX8-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX8-NEXT: v_and_b32_e32 v3, s8, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1
+; GFX8-NEXT: v_and_b32_e32 v3, s7, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1
; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -3684,37 +3682,36 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
-; GFX7-NEXT: s_and_b32 s6, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX7-NEXT: s_and_b32 s4, s20, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
-; GFX7-NEXT: s_lshl_b32 s7, s4, 3
-; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX7-NEXT: s_not_b32 s8, s4
+; GFX7-NEXT: s_lshl_b32 s6, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s8, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -3724,7 +3721,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -3732,31 +3729,30 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
-; GFX6-NEXT: s_and_b32 s6, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX6-NEXT: s_and_b32 s4, s20, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
-; GFX6-NEXT: s_lshl_b32 s7, s4, 3
-; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX6-NEXT: s_not_b32 s8, s4
+; GFX6-NEXT: s_lshl_b32 s6, s4, 3
+; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s8, v1
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX6-NEXT: v_add_f32_e32 v0, v0, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -3766,7 +3762,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -3786,42 +3782,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: s_addk_co_i32 s16, 0x200
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_and_b32 s5, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_lshl_b32 s5, s5, 3
+; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: s_and_b32 s4, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_not_b32 s7, s6
-; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1
+; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_mov_b32_e32 v3, v1
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -3829,25 +3825,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s16, 0x200
-; GFX940-NEXT: s_and_b32 s6, s16, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s6
+; GFX940-NEXT: s_and_b32 s4, s16, -4
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
; GFX940-NEXT: s_and_b32 s4, s16, 3
-; GFX940-NEXT: s_lshl_b32 s7, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX940-NEXT: s_not_b32 s8, s4
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3
-; GFX940-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s6
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3
+; GFX940-NEXT: v_add_f16_e32 v2, v2, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -3863,43 +3858,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_and_b32 s5, s16, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s5, s5, 3
-; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_not_b32 s7, s6
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s6, 0
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: s_and_b32 s4, s16, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f16_e32 v1, v1, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1
-; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB14_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3907,59 +3900,57 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: s_and_b32 s5, s20, 3
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: s_lshl_b32 s5, s5, 3
-; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX10-NEXT: s_not_b32 s7, s6
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
-; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: s_and_b32 s4, s20, 3
+; GFX10-NEXT: s_lshl_b32 s4, s4, 3
+; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX10-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen
+; GFX10-NEXT: s_not_b32 s6, s5
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB14_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
-; GFX90A-NEXT: s_and_b32 s6, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
+; GFX90A-NEXT: s_and_b32 s4, s20, -4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s4
; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
-; GFX90A-NEXT: s_lshl_b32 s7, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX90A-NEXT: s_not_b32 s8, s4
+; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
+; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3
-; GFX90A-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3
+; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -3975,30 +3966,29 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
-; GFX908-NEXT: s_and_b32 s6, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX908-NEXT: s_and_b32 s4, s20, -4
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
-; GFX908-NEXT: s_lshl_b32 s7, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX908-NEXT: s_not_b32 s8, s4
+; GFX908-NEXT: s_lshl_b32 s6, s4, 3
+; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2
; GFX908-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1
+; GFX908-NEXT: v_mov_b32_e32 v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v1
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v2, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4009,31 +3999,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
-; GFX8-NEXT: s_and_b32 s6, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX8-NEXT: s_and_b32 s4, s20, -4
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
-; GFX8-NEXT: s_lshl_b32 s7, s4, 3
-; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX8-NEXT: s_not_b32 s8, s4
+; GFX8-NEXT: s_lshl_b32 s6, s4, 3
+; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2
; GFX8-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX8-NEXT: v_and_b32_e32 v3, s8, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_and_b32_e32 v4, s7, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4044,35 +4033,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
-; GFX7-NEXT: s_and_b32 s6, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX7-NEXT: s_and_b32 s4, s20, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
-; GFX7-NEXT: s_lshl_b32 s7, s4, 3
-; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX7-NEXT: s_not_b32 s8, s4
+; GFX7-NEXT: s_lshl_b32 s6, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB14_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4083,36 +4071,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
-; GFX6-NEXT: s_and_b32 s6, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX6-NEXT: s_and_b32 s4, s20, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
-; GFX6-NEXT: s_lshl_b32 s7, s4, 3
-; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-NEXT: s_not_b32 s8, s4
+; GFX6-NEXT: s_lshl_b32 s6, s4, 3
+; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX6-NEXT: v_add_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB14_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4781,27 +4768,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_and_b32 s5, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_lshl_b32 s5, s5, 3
+; GFX12-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-NEXT: s_and_b32 s4, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_not_b32 s7, s6
-; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX12-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_add_f32_e32 v0, v0, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0
@@ -4811,23 +4799,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v2
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -4835,33 +4823,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s16, 0x200
-; GFX940-NEXT: s_and_b32 s6, s16, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s6
-; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s16, -4
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
; GFX940-NEXT: s_and_b32 s4, s16, 3
-; GFX940-NEXT: s_lshl_b32 s7, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX940-NEXT: s_not_b32 s8, s4
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
+; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: v_mov_b32_e32 v5, s6
-; GFX940-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v5
; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v2, v2, v0, s9
+; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -4871,32 +4858,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX940-NEXT: s_cbranch_execnz .LBB16_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_and_b32 s5, s16, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s5, s5, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX11-NEXT: s_not_b32 s7, s6
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_and_b32 s4, s16, 3
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
@@ -4906,97 +4894,95 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB16_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: s_and_b32 s5, s20, 3
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: s_lshl_b32 s5, s5, 3
-; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX10-NEXT: s_not_b32 s7, s6
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
-; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: s_and_b32 s4, s20, 3
+; GFX10-NEXT: s_lshl_b32 s4, s4, 3
+; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX10-NEXT: s_not_b32 s6, s5
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v5
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB16_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
-; GFX90A-NEXT: s_and_b32 s6, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX90A-NEXT: s_and_b32 s4, s20, -4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
-; GFX90A-NEXT: s_lshl_b32 s7, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX90A-NEXT: s_not_b32 s8, s4
+; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
+; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
+; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5
; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9
+; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, s6
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -5006,39 +4992,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
-; GFX908-NEXT: s_and_b32 s6, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX908-NEXT: s_and_b32 s4, s20, -4
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
-; GFX908-NEXT: s_lshl_b32 s7, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX908-NEXT: s_not_b32 s8, s4
+; GFX908-NEXT: s_lshl_b32 s6, s4, 3
+; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v5
; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9
+; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -5048,41 +5033,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX908-NEXT: s_cbranch_execnz .LBB16_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
-; GFX8-NEXT: s_and_b32 s6, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX8-NEXT: s_and_b32 s4, s20, -4
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
-; GFX8-NEXT: s_lshl_b32 s7, s4, 3
-; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX8-NEXT: s_not_b32 s8, s4
+; GFX8-NEXT: s_lshl_b32 s6, s4, 3
+; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, s8, v1
+; GFX8-NEXT: v_and_b32_e32 v2, s7, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -5092,37 +5076,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
-; GFX7-NEXT: s_and_b32 s6, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX7-NEXT: s_and_b32 s4, s20, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
-; GFX7-NEXT: s_lshl_b32 s7, s4, 3
-; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7
+; GFX7-NEXT: s_lshl_b32 s6, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: s_not_b32 s8, s4
+; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s8, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -5132,7 +5115,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -5140,31 +5123,30 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
-; GFX6-NEXT: s_and_b32 s6, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX6-NEXT: s_and_b32 s4, s20, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
-; GFX6-NEXT: s_lshl_b32 s7, s4, 3
-; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7
+; GFX6-NEXT: s_lshl_b32 s6, s4, 3
+; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: s_not_b32 s8, s4
+; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX6-NEXT: v_add_f32_e32 v0, v0, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s8, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -5174,7 +5156,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: s_cbranch_execnz .LBB16_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -5192,52 +5174,53 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_and_b32 s5, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_lshl_b32 s5, s5, 3
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: s_and_b32 s4, s16, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_not_b32 s7, s6
-; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX12-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_add_f32_e32 v0, v0, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v3
-; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -5245,33 +5228,32 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s16, 0x200
-; GFX940-NEXT: s_and_b32 s6, s16, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s6
-; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s16, -4
+; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX940-NEXT: s_and_b32 s4, s16, 3
-; GFX940-NEXT: s_lshl_b32 s7, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX940-NEXT: s_not_b32 s8, s4
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: v_mov_b32_e32 v3, s6
-; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v3
; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v4, v4, v0, s9
+; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -5287,123 +5269,122 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_and_b32 s5, s16, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s5, s5, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX11-NEXT: s_not_b32 s7, s6
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: s_and_b32 s4, s16, 3
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v3
-; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB17_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: s_and_b32 s5, s20, 3
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: s_lshl_b32 s5, s5, 3
-; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX10-NEXT: s_not_b32 s7, s6
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
-; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: s_and_b32 s4, s20, 3
+; GFX10-NEXT: s_lshl_b32 s4, s4, 3
+; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX10-NEXT: s_not_b32 s6, s5
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB17_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
-; GFX90A-NEXT: s_and_b32 s6, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX90A-NEXT: s_and_b32 s4, s20, -4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
-; GFX90A-NEXT: s_lshl_b32 s7, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX90A-NEXT: s_not_b32 s8, s4
+; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
+; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3
; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9
+; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -5419,37 +5400,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
-; GFX908-NEXT: s_and_b32 s6, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX908-NEXT: s_and_b32 s4, s20, -4
+; GFX908-NEXT: v_mov_b32_e32 v2, s4
+; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
-; GFX908-NEXT: s_lshl_b32 s7, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX908-NEXT: s_not_b32 s8, s4
+; GFX908-NEXT: s_lshl_b32 s6, s4, 3
+; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5460,39 +5440,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
-; GFX8-NEXT: s_and_b32 s6, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX8-NEXT: s_and_b32 s4, s20, -4
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
-; GFX8-NEXT: s_lshl_b32 s7, s4, 3
-; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX8-NEXT: s_not_b32 s8, s4
+; GFX8-NEXT: s_lshl_b32 s6, s4, 3
+; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5503,35 +5482,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
-; GFX7-NEXT: s_and_b32 s6, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX7-NEXT: s_and_b32 s4, s20, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
-; GFX7-NEXT: s_lshl_b32 s7, s4, 3
-; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7
+; GFX7-NEXT: s_lshl_b32 s6, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: s_not_b32 s8, s4
+; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5542,36 +5520,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
-; GFX6-NEXT: s_and_b32 s6, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX6-NEXT: s_and_b32 s4, s20, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
-; GFX6-NEXT: s_lshl_b32 s7, s4, 3
-; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7
+; GFX6-NEXT: s_lshl_b32 s6, s4, 3
+; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: s_not_b32 s8, s4
+; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_add_f32_e32 v0, v0, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB17_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6316,24 +6293,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -6347,23 +6325,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
@@ -6388,18 +6366,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB19_1
@@ -6415,20 +6393,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v1, v4, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
@@ -6451,30 +6429,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mov_b32_e32 v8, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5
-; GFX7-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v6, v0
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
@@ -6497,31 +6475,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v8, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5
-; GFX6-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v6, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB19_1
@@ -6563,23 +6541,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_mov_b32_e32 v4, v1
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v4
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -6592,23 +6572,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB20_1
@@ -6641,21 +6621,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v3, v2, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB20_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6666,41 +6646,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_mov_b32_e32 v7, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_or_b32_e32 v4, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB20_1
@@ -6712,42 +6692,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_mov_b32_e32 v7, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX6-NEXT: v_or_b32_e32 v4, v2, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB20_1
@@ -7288,24 +7268,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -7319,23 +7300,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB22_1
@@ -7351,12 +7332,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -7377,18 +7358,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
@@ -7404,20 +7385,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v1, v4, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
@@ -7440,30 +7421,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mov_b32_e32 v8, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5
-; GFX7-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v6, v0
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB22_1
@@ -7486,31 +7467,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v8, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5
-; GFX6-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v6, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB22_1
@@ -7552,23 +7533,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_mov_b32_e32 v4, v1
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v4
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -7581,23 +7564,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB23_1
@@ -7612,11 +7595,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -7637,19 +7620,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v1
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v2, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB23_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7663,21 +7646,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v3, v2, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7688,41 +7671,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_mov_b32_e32 v7, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_or_b32_e32 v4, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB23_1
@@ -7734,42 +7717,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_mov_b32_e32 v7, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX6-NEXT: v_or_b32_e32 v4, v2, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB23_1
@@ -7810,24 +7793,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -7841,23 +7825,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB24_1
@@ -7873,12 +7857,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -7899,18 +7883,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB24_1
@@ -7926,20 +7910,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v1, v4, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB24_1
@@ -7962,30 +7946,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mov_b32_e32 v8, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5
-; GFX7-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v6, v0
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB24_1
@@ -8008,31 +7992,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v8, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5
-; GFX6-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v6, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB24_1
@@ -8074,23 +8058,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_mov_b32_e32 v4, v1
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v4
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -8103,23 +8089,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB25_1
@@ -8134,11 +8120,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -8159,19 +8145,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v1
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v2, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB25_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8185,21 +8171,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v3, v2, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB25_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8210,41 +8196,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_mov_b32_e32 v7, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_or_b32_e32 v4, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB25_1
@@ -8256,42 +8242,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_mov_b32_e32 v7, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX6-NEXT: v_or_b32_e32 v4, v2, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB25_1
@@ -8326,41 +8312,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX940-NEXT: s_add_i32 s8, s16, 0x400
+; GFX940-NEXT: s_add_i32 s4, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
-; GFX940-NEXT: s_mov_b32 s10, 0x7060302
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX940-NEXT: v_add_f32_e32 v1, v4, v1
-; GFX940-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, s8
-; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5]
-; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
+; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
+; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_cbranch_execnz .LBB26_1
@@ -8371,45 +8357,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s6, s16, 0x400
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v1, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4
+; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -8422,41 +8410,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s6, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v0, v3, v0
-; GFX10-NEXT: v_add_f32_e32 v1, v5, v1
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4
-; GFX10-NEXT: v_mov_b32_e32 v5, s6
-; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
+; GFX10-NEXT: v_mov_b32_e32 v1, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
@@ -8467,40 +8455,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s8, s20, 0x400
+; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s10, 0x7060302
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10
-; GFX90A-NEXT: v_mov_b32_e32 v3, s8
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
@@ -8511,41 +8499,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s8, s20, 0x400
+; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
-; GFX908-NEXT: s_mov_b32 s10, 0x7060302
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX908-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9
-; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s8
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
+; GFX908-NEXT: v_mov_b32_e32 v1, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB26_1
@@ -8556,42 +8544,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s8, s20, 0x400
+; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX8-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s8
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
+; GFX8-NEXT: v_mov_b32_e32 v1, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
@@ -8603,38 +8591,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v1
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB26_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8645,39 +8633,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, s6
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v1
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB26_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8708,40 +8696,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s16
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024
-; GFX940-NEXT: s_add_i32 s8, s16, 0x400
+; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_add_i32 s4, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
-; GFX940-NEXT: s_mov_b32 s10, 0x7060302
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX940-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX940-NEXT: v_add_f32_e32 v2, v5, v4
-; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s8
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_cbranch_execnz .LBB27_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8751,45 +8739,43 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s6, s16, 0x400
+; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX11-NEXT: v_mov_b32_e32 v5, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v5
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -8803,39 +8789,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s6, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v5, s6
-; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB27_1
@@ -8847,39 +8833,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s8, s20, 0x400
+; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s10, 0x7060302
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4
-; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10
-; GFX90A-NEXT: v_mov_b32_e32 v6, s8
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8890,40 +8876,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s8, s20, 0x400
+; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
-; GFX908-NEXT: s_mov_b32 s10, 0x7060302
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX908-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v6, s8
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX908-NEXT: v_mov_b32_e32 v6, v1
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB27_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8934,41 +8920,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s8, s20, 0x400
+; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v6, s8
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB27_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8979,38 +8965,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB27_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9021,39 +9007,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v6, s6
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB27_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9106,7 +9092,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -9118,40 +9104,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
+; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
; GFX940-NEXT: ; implicit-def: $vgpr4
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB28_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX940-NEXT: s_movk_i32 s10, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX940-NEXT: s_mov_b32 s11, 0x7060302
; GFX940-NEXT: .LBB28_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v9
-; GFX940-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s10
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX940-NEXT: v_add_f32_e32 v4, v4, v9
+; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX940-NEXT: s_mov_b64 s[8:9], exec
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; GFX940-NEXT: v_add_f32_e32 v6, v7, v6
-; GFX940-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX940-NEXT: v_add3_u32 v7, v7, v6, s10
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v10
+; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10
+; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX940-NEXT: v_perm_b32 v8, v6, v4, s11
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
+; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -9164,27 +9150,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB28_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX940-NEXT: s_mov_b64 exec, s[8:9]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB28_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
@@ -9198,41 +9184,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB28_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB28_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v6, v10, v7
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
+; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v7, v6, v4, 0x7060302
-; GFX11-NEXT: v_mov_b32_e32 v6, v7
-; GFX11-NEXT: v_mov_b32_e32 v7, v8
+; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_mov_b32_e32 v5, v6
; GFX11-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -9246,14 +9233,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB28_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX11-NEXT: v_mov_b32_e32 v8, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -9261,14 +9248,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_cbranch_execnz .LBB28_3
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v6
+; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
@@ -9280,38 +9268,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: ; implicit-def: $vgpr4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB28_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX10-NEXT: .LBB28_3: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX10-NEXT: v_add_f32_e32 v6, v10, v7
-; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6
-; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo
-; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v6, v7
-; GFX10-NEXT: v_mov_b32_e32 v7, v8
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v5, v6
; GFX10-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -9323,15 +9311,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB28_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX10-NEXT: v_mov_b32_e32 v8, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -9340,13 +9328,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_cbranch_execnz .LBB28_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -9358,38 +9346,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: ; implicit-def: $vgpr4
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX90A-NEXT: s_mov_b32 s15, 0x7060302
; GFX90A-NEXT: .LBB28_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; GFX90A-NEXT: v_add_f32_e32 v6, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10
+; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -9401,27 +9389,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4
+; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -9433,39 +9421,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: ; implicit-def: $vgpr4
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB28_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX908-NEXT: s_movk_i32 s14, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX908-NEXT: s_mov_b32 s15, 0x7060302
; GFX908-NEXT: .LBB28_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX908-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX908-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
-; GFX908-NEXT: v_add_f32_e32 v6, v7, v6
-; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc
-; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15
-; GFX908-NEXT: v_mov_b32_e32 v6, v7
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14
+; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v7, v8
+; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -9477,27 +9465,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB28_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v8, v6
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB28_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -9509,40 +9497,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: ; implicit-def: $vgpr4
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB28_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX8-NEXT: .LBB28_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX8-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
-; GFX8-NEXT: v_add_f32_e32 v6, v7, v6
-; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16
-; GFX8-NEXT: v_mov_b32_e32 v6, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v7, v8
+; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -9554,21 +9542,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB28_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v8, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB28_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -9742,41 +9730,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX940-NEXT: s_add_i32 s8, s16, 0x400
+; GFX940-NEXT: s_add_i32 s4, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
-; GFX940-NEXT: s_mov_b32 s10, 0x7060302
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX940-NEXT: v_add_f32_e32 v1, v4, v1
-; GFX940-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, s8
-; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5]
-; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
+; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
+; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_cbranch_execnz .LBB29_1
@@ -9787,45 +9775,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s6, s16, 0x400
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v1, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4
+; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -9838,41 +9828,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s6, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v0, v3, v0
-; GFX10-NEXT: v_add_f32_e32 v1, v5, v1
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4
-; GFX10-NEXT: v_mov_b32_e32 v5, s6
-; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
+; GFX10-NEXT: v_mov_b32_e32 v1, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB29_1
@@ -9883,40 +9873,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s8, s20, 0x400
+; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s10, 0x7060302
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10
-; GFX90A-NEXT: v_mov_b32_e32 v3, s8
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
@@ -9927,41 +9917,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s8, s20, 0x400
+; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
-; GFX908-NEXT: s_mov_b32 s10, 0x7060302
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX908-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9
-; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s8
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
+; GFX908-NEXT: v_mov_b32_e32 v1, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB29_1
@@ -9972,42 +9962,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s8, s20, 0x400
+; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX8-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s8
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
+; GFX8-NEXT: v_mov_b32_e32 v1, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB29_1
@@ -10019,38 +10009,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v1
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB29_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10061,39 +10051,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, s6
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v1
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB29_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10124,40 +10114,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s16
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024
-; GFX940-NEXT: s_add_i32 s8, s16, 0x400
+; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_add_i32 s4, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
-; GFX940-NEXT: s_mov_b32 s10, 0x7060302
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX940-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX940-NEXT: v_add_f32_e32 v2, v5, v4
-; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s8
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_cbranch_execnz .LBB30_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10167,45 +10157,43 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s6, s16, 0x400
+; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX11-NEXT: v_mov_b32_e32 v5, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v5
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -10219,39 +10207,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s6, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v5, s6
-; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB30_1
@@ -10263,39 +10251,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s8, s20, 0x400
+; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s10, 0x7060302
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4
-; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10
-; GFX90A-NEXT: v_mov_b32_e32 v6, s8
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10306,40 +10294,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s8, s20, 0x400
+; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
-; GFX908-NEXT: s_mov_b32 s10, 0x7060302
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX908-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v6, s8
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX908-NEXT: v_mov_b32_e32 v6, v1
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB30_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10350,41 +10338,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s8, s20, 0x400
+; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v6, s8
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB30_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10395,38 +10383,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB30_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10437,39 +10425,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v6, s6
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB30_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10499,41 +10487,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX940-NEXT: s_add_i32 s8, s16, 0x400
+; GFX940-NEXT: s_add_i32 s4, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
-; GFX940-NEXT: s_mov_b32 s10, 0x7060302
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX940-NEXT: v_add_f32_e32 v1, v4, v1
-; GFX940-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, s8
-; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5]
-; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
+; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
+; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_cbranch_execnz .LBB31_1
@@ -10544,45 +10532,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s6, s16, 0x400
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v1, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4
+; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -10595,41 +10585,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s6, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v0, v3, v0
-; GFX10-NEXT: v_add_f32_e32 v1, v5, v1
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4
-; GFX10-NEXT: v_mov_b32_e32 v5, s6
-; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
+; GFX10-NEXT: v_mov_b32_e32 v1, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB31_1
@@ -10640,40 +10630,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s8, s20, 0x400
+; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s10, 0x7060302
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10
-; GFX90A-NEXT: v_mov_b32_e32 v3, s8
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB31_1
@@ -10684,41 +10674,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s8, s20, 0x400
+; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
-; GFX908-NEXT: s_mov_b32 s10, 0x7060302
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX908-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9
-; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s8
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
+; GFX908-NEXT: v_mov_b32_e32 v1, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB31_1
@@ -10729,42 +10719,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s8, s20, 0x400
+; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX8-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s8
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
+; GFX8-NEXT: v_mov_b32_e32 v1, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB31_1
@@ -10776,38 +10766,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v1
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB31_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10818,39 +10808,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, s6
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v1
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB31_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10881,40 +10871,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s16
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024
-; GFX940-NEXT: s_add_i32 s8, s16, 0x400
+; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_add_i32 s4, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
-; GFX940-NEXT: s_mov_b32 s10, 0x7060302
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX940-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX940-NEXT: v_add_f32_e32 v2, v5, v4
-; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s8
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_cbranch_execnz .LBB32_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10924,45 +10914,43 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s6, s16, 0x400
+; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX11-NEXT: v_mov_b32_e32 v5, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v5
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -10976,39 +10964,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s6, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v5, s6
-; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB32_1
@@ -11020,39 +11008,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s8, s20, 0x400
+; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s10, 0x7060302
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4
-; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10
-; GFX90A-NEXT: v_mov_b32_e32 v6, s8
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11063,40 +11051,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s8, s20, 0x400
+; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
-; GFX908-NEXT: s_mov_b32 s10, 0x7060302
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX908-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v6, s8
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX908-NEXT: v_mov_b32_e32 v6, v1
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB32_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11107,41 +11095,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s8, s20, 0x400
+; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v6, s8
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB32_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11152,38 +11140,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11194,39 +11182,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v6, s6
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB32_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11257,40 +11245,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s16
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024
-; GFX940-NEXT: s_add_i32 s8, s16, 0x400
+; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_add_i32 s4, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
-; GFX940-NEXT: s_mov_b32 s10, 0x7060302
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX940-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX940-NEXT: v_add_f32_e32 v2, v5, v4
-; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s8
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_cbranch_execnz .LBB33_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11300,45 +11288,43 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s6, s16, 0x400
+; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX11-NEXT: v_mov_b32_e32 v5, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v5
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -11352,39 +11338,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s6, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v5, s6
-; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB33_1
@@ -11396,39 +11382,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s8, s20, 0x400
+; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s10, 0x7060302
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4
-; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10
-; GFX90A-NEXT: v_mov_b32_e32 v6, s8
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB33_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11439,40 +11425,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s8, s20, 0x400
+; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
-; GFX908-NEXT: s_mov_b32 s10, 0x7060302
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX908-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v6, s8
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX908-NEXT: v_mov_b32_e32 v6, v1
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB33_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11483,41 +11469,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s8, s20, 0x400
+; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v4
-; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v6, s8
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB33_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11528,38 +11514,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB33_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11570,39 +11556,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v6, s6
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB33_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11660,23 +11646,23 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB34_1
@@ -11692,12 +11678,12 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
@@ -11720,18 +11706,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB34_1
@@ -11747,18 +11733,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB34_1
@@ -11774,18 +11760,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB34_1
@@ -11801,19 +11787,19 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB34_1
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index b00f8aecbd2f58..c7511a2df9fe13 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -31,19 +31,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
; GFX940-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, s6
+; GFX940-NEXT: v_max_f32_e32 v4, v0, v2
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
@@ -82,19 +82,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -110,25 +110,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v0, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v0, v1
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB0_1
@@ -145,19 +145,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB0_1
@@ -207,24 +207,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s16
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v0, v0
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v1, v0, v0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s6
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX940-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_cbranch_execnz .LBB1_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -257,23 +257,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v1, v0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -284,24 +284,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v0, v0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v1, v0, v0
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v1, v3, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB1_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -316,20 +316,20 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB1_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -400,7 +400,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -412,22 +412,22 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
+; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
; GFX940-NEXT: ; implicit-def: $vgpr4
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB2_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_max_f32_e32 v9, v5, v5
; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB2_4 Depth 2
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v6, v9, v9
-; GFX940-NEXT: v_max_f32_e32 v8, v6, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v7, v7
+; GFX940-NEXT: v_max_f32_e32 v6, v4, v9
; GFX940-NEXT: s_mov_b64 s[8:9], exec
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
@@ -441,21 +441,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB2_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX940-NEXT: s_mov_b64 exec, s[8:9]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB2_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -520,7 +520,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -532,22 +532,22 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: ; implicit-def: $vgpr4
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5
; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9
-; GFX90A-NEXT: v_max_f32_e32 v8, v6, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7
+; GFX90A-NEXT: v_max_f32_e32 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -559,27 +559,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4
+; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -591,23 +591,23 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: ; implicit-def: $vgpr4
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_max_f32_e32 v8, v5, v5
; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB2_4 Depth 2
-; GFX908-NEXT: v_max_f32_e32 v4, v5, v5
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v6, v8, v8
-; GFX908-NEXT: v_max_f32_e32 v7, v6, v4
-; GFX908-NEXT: v_mov_b32_e32 v6, v7
+; GFX908-NEXT: v_max_f32_e32 v4, v6, v6
+; GFX908-NEXT: v_max_f32_e32 v5, v4, v8
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v7, v8
+; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -619,21 +619,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v8, v6
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB2_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -771,19 +771,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
; GFX940-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, s6
+; GFX940-NEXT: v_max_f32_e32 v4, v0, v2
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
@@ -800,28 +800,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v0
-; GFX11-NEXT: v_max_f32_e32 v0, v2, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v1, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX11-NEXT: v_max_f32_e32 v4, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v0, v3
-; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -833,27 +832,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v2, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v1, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB3_1
@@ -864,19 +863,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -892,25 +891,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v0, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v0, v1
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB3_1
@@ -927,19 +926,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB3_1
@@ -956,19 +955,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX7-NEXT: v_max_f32_e32 v3, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX7-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB3_1
@@ -985,20 +984,20 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX6-NEXT: v_max_f32_e32 v3, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX6-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB3_1
@@ -1029,19 +1028,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
; GFX940-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, s6
+; GFX940-NEXT: v_max_f32_e32 v4, v0, v2
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
@@ -1080,19 +1079,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1108,25 +1107,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v0, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v0, v1
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB4_1
@@ -1143,19 +1142,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB4_1
@@ -1198,29 +1197,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v10, s5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1243,30 +1243,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x800
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v10, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1298,29 +1298,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v9, v1
-; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v10, s6
-; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
-; GFX908-NEXT: v_mov_b32_e32 v1, v7
-; GFX908-NEXT: v_mov_b32_e32 v2, v8
-; GFX908-NEXT: v_mov_b32_e32 v3, v9
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
@@ -1331,29 +1331,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v10, s6
-; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB5_1
@@ -1393,27 +1393,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[0:1]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[6:7], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5
-; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT: v_mov_b32_e32 v6, v2
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1437,28 +1437,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x800
+; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX11-NEXT: s_add_i32 s4, s16, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3]
-; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v6, v2
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1491,27 +1490,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, s20
-; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
+; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
+; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v10, s6
-; GFX908-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v9, v5
-; GFX908-NEXT: v_mov_b32_e32 v8, v4
-; GFX908-NEXT: v_mov_b32_e32 v7, v3
-; GFX908-NEXT: v_mov_b32_e32 v6, v2
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v10, v3
+; GFX908-NEXT: v_mov_b32_e32 v9, v2
+; GFX908-NEXT: v_mov_b32_e32 v8, v1
+; GFX908-NEXT: v_mov_b32_e32 v7, v0
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v6
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX908-NEXT: v_mov_b32_e32 v2, v7
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v5, v7
+; GFX908-NEXT: v_mov_b32_e32 v3, v8
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1522,27 +1521,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
+; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
+; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v10, s6
-; GFX8-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v9, v5
-; GFX8-NEXT: v_mov_b32_e32 v8, v4
-; GFX8-NEXT: v_mov_b32_e32 v7, v3
-; GFX8-NEXT: v_mov_b32_e32 v6, v2
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v10, v3
+; GFX8-NEXT: v_mov_b32_e32 v9, v2
+; GFX8-NEXT: v_mov_b32_e32 v8, v1
+; GFX8-NEXT: v_mov_b32_e32 v7, v0
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v6
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, v7
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, v7
+; GFX8-NEXT: v_mov_b32_e32 v3, v8
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1606,17 +1605,17 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB7_4 Depth 2
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[5:6], v[5:6]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[13:14], v[13:14]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[2:3], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
@@ -1710,17 +1709,17 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB7_4 Depth 2
-; GFX11-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14]
+; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1]
+; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
@@ -1838,15 +1837,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB7_4 Depth 2
-; GFX908-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14]
+; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1]
+; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5]
; GFX908-NEXT: v_mov_b32_e32 v0, v11
; GFX908-NEXT: v_mov_b32_e32 v1, v12
; GFX908-NEXT: v_mov_b32_e32 v2, v13
@@ -1904,15 +1903,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB7_4 Depth 2
-; GFX8-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14]
+; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1]
+; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, v11
; GFX8-NEXT: v_mov_b32_e32 v1, v12
; GFX8-NEXT: v_mov_b32_e32 v2, v13
@@ -2012,29 +2011,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v10, s5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2057,30 +2057,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x800
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v10, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2092,31 +2092,31 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: s_add_i32 s5, s20, 0x800
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: s_add_i32 s4, s20, 0x800
+; GFX10-NEXT: v_mov_b32_e32 v6, s4
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v10, s5
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
-; GFX10-NEXT: v_mov_b32_e32 v1, v7
-; GFX10-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-NEXT: v_mov_b32_e32 v3, v9
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX10-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
+; GFX10-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB8_1
@@ -2127,26 +2127,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX90A-NEXT: s_add_i32 s6, s20, 0x800
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT: v_mov_b32_e32 v6, s6
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v10, s6
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
@@ -2157,29 +2157,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v9, v1
-; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v10, s6
-; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
-; GFX908-NEXT: v_mov_b32_e32 v1, v7
-; GFX908-NEXT: v_mov_b32_e32 v2, v8
-; GFX908-NEXT: v_mov_b32_e32 v3, v9
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB8_1
@@ -2190,29 +2190,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v10, s6
-; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB8_1
@@ -2223,29 +2223,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s20
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX7-NEXT: s_add_i32 s6, s20, 0x800
+; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s6
; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v10, s6
-; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, v6
-; GFX7-NEXT: v_mov_b32_e32 v1, v7
-; GFX7-NEXT: v_mov_b32_e32 v2, v8
-; GFX7-NEXT: v_mov_b32_e32 v3, v9
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v10, v1
+; GFX7-NEXT: v_mov_b32_e32 v9, v0
+; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX7-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
+; GFX7-NEXT: v_mov_b32_e32 v1, v8
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
+; GFX7-NEXT: v_mov_b32_e32 v3, v10
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB8_1
@@ -2256,30 +2256,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
; GFX6-NEXT: v_mov_b32_e32 v0, s20
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX6-NEXT: s_add_i32 s6, s20, 0x800
+; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v6, s6
; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v9, v1
-; GFX6-NEXT: v_mov_b32_e32 v8, v0
+; GFX6-NEXT: v_mov_b32_e32 v10, v1
+; GFX6-NEXT: v_mov_b32_e32 v9, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v10, s6
-; GFX6-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX6-NEXT: v_mov_b32_e32 v0, v6
-; GFX6-NEXT: v_mov_b32_e32 v1, v7
-; GFX6-NEXT: v_mov_b32_e32 v2, v8
-; GFX6-NEXT: v_mov_b32_e32 v3, v9
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX6-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v0, v7
+; GFX6-NEXT: v_mov_b32_e32 v1, v8
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
+; GFX6-NEXT: v_mov_b32_e32 v3, v10
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB8_1
@@ -2300,29 +2300,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v10, s5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2345,30 +2346,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x800
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v10, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2400,29 +2401,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v9, v1
-; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v10, s6
-; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
-; GFX908-NEXT: v_mov_b32_e32 v1, v7
-; GFX908-NEXT: v_mov_b32_e32 v2, v8
-; GFX908-NEXT: v_mov_b32_e32 v3, v9
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB9_1
@@ -2433,29 +2434,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v10, s6
-; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB9_1
@@ -2499,47 +2500,47 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_and_b32 s5, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_lshl_b32 s5, s5, 3
+; GFX12-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-NEXT: s_and_b32 s4, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_not_b32 s7, s6
-; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
-; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1
+; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v3, v1
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v3
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -2547,263 +2548,256 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s16, 0x200
-; GFX940-NEXT: s_and_b32 s6, s16, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s6
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s16, -4
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
; GFX940-NEXT: s_and_b32 s4, s16, 3
-; GFX940-NEXT: s_lshl_b32 s7, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX940-NEXT: s_not_b32 s8, s4
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f16_e32 v5, v0, v0
; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3
-; GFX940-NEXT: v_max_f16_e32 v2, v0, v0
-; GFX940-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX940-NEXT: v_max_f16_e32 v1, v1, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s6
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX940-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX940-NEXT: v_max_f16_e32 v0, v0, v5
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_cbranch_execnz .LBB10_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_max_f16_e32 v5, v0, v0
; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_and_b32 s5, s16, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s5, s5, 3
-; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_and_b32 s4, s16, 3
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_not_b32 s7, s6
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX11-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT: v_max_f16_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB10_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v3
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
+; GFX10-NEXT: v_max_f16_e32 v5, v0, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: s_and_b32 s5, s20, 3
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: s_lshl_b32 s5, s5, 3
-; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX10-NEXT: s_not_b32 s7, s6
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
-; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: s_and_b32 s4, s20, 3
+; GFX10-NEXT: s_lshl_b32 s4, s4, 3
+; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX10-NEXT: s_not_b32 s6, s5
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX10-NEXT: v_max_f16_e32 v1, v1, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v5
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
-; GFX90A-NEXT: s_and_b32 s6, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen
+; GFX90A-NEXT: s_and_b32 s4, s20, -4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
-; GFX90A-NEXT: s_lshl_b32 s7, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX90A-NEXT: s_not_b32 s8, s4
+; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
+; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3
-; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0
-; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX90A-NEXT: v_max_f16_e32 v1, v1, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
-; GFX908-NEXT: s_and_b32 s6, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX908-NEXT: s_and_b32 s4, s20, -4
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
-; GFX908-NEXT: s_lshl_b32 s7, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX908-NEXT: s_not_b32 s8, s4
+; GFX908-NEXT: s_lshl_b32 s6, s4, 3
+; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v5, v0, v0
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX908-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX908-NEXT: v_max_f16_e32 v1, v1, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB10_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
-; GFX8-NEXT: s_and_b32 s6, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX8-NEXT: s_and_b32 s4, s20, -4
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
-; GFX8-NEXT: s_lshl_b32 s7, s4, 3
-; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX8-NEXT: s_not_b32 s8, s4
+; GFX8-NEXT: s_lshl_b32 s6, s4, 3
+; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v5, v0, v0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
-; GFX8-NEXT: v_and_b32_e32 v4, s8, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v5
+; GFX8-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
-; GFX7-NEXT: s_and_b32 s6, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX7-NEXT: s_and_b32 s4, s20, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
-; GFX7-NEXT: s_lshl_b32 s7, s4, 3
-; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX7-NEXT: s_not_b32 s8, s4
+; GFX7-NEXT: s_lshl_b32 s6, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s8, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -2813,7 +2807,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -2821,31 +2815,30 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
-; GFX6-NEXT: s_and_b32 s6, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX6-NEXT: s_and_b32 s4, s20, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
-; GFX6-NEXT: s_lshl_b32 s7, s4, 3
-; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX6-NEXT: s_not_b32 s8, s4
+; GFX6-NEXT: s_lshl_b32 s6, s4, 3
+; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s8, v1
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX6-NEXT: v_max_f32_e32 v0, v0, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -2855,7 +2848,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX6-NEXT: s_cbranch_execnz .LBB10_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2873,46 +2866,46 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_and_b32 s5, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_lshl_b32 s5, s5, 3
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: s_and_b32 s4, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_not_b32 s7, s6
-; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
-; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1
+; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v3, v1
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -2920,32 +2913,31 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s16, 0x200
-; GFX940-NEXT: s_and_b32 s6, s16, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s6
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s16, -4
+; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX940-NEXT: s_and_b32 s4, s16, 3
-; GFX940-NEXT: s_lshl_b32 s7, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX940-NEXT: s_not_b32 s8, s4
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f16_e32 v3, v0, v0
; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3
-; GFX940-NEXT: v_max_f16_e32 v2, v0, v0
-; GFX940-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX940-NEXT: v_max_f16_e32 v1, v1, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s6
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX940-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX940-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_cbranch_execnz .LBB11_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2956,114 +2948,111 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_max_f16_e32 v3, v0, v0
; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_and_b32 s5, s16, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s5, s5, 3
-; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: s_and_b32 s4, s16, 3
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_not_b32 s7, s6
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX11-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT: v_max_f16_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB11_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
+; GFX10-NEXT: v_max_f16_e32 v3, v0, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: s_and_b32 s5, s20, 3
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: s_lshl_b32 s5, s5, 3
-; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX10-NEXT: s_not_b32 s7, s6
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
-; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: s_and_b32 s4, s20, 3
+; GFX10-NEXT: s_lshl_b32 s4, s4, 3
+; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX10-NEXT: s_not_b32 s6, s5
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX10-NEXT: v_max_f16_e32 v1, v1, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
-; GFX90A-NEXT: s_and_b32 s6, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen
+; GFX90A-NEXT: s_and_b32 s4, s20, -4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
-; GFX90A-NEXT: s_lshl_b32 s7, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX90A-NEXT: s_not_b32 s8, s4
+; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
+; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3
-; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0
-; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX90A-NEXT: v_max_f16_e32 v1, v1, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3074,32 +3063,31 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
-; GFX908-NEXT: s_and_b32 s6, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX908-NEXT: s_and_b32 s4, s20, -4
+; GFX908-NEXT: v_mov_b32_e32 v2, s4
+; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
-; GFX908-NEXT: s_lshl_b32 s7, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX908-NEXT: s_not_b32 s8, s4
+; GFX908-NEXT: s_lshl_b32 s6, s4, 3
+; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v3, v0, v0
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX908-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX908-NEXT: v_max_f16_e32 v1, v1, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3110,33 +3098,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
-; GFX8-NEXT: s_and_b32 s6, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX8-NEXT: s_and_b32 s4, s20, -4
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
-; GFX8-NEXT: s_lshl_b32 s7, s4, 3
-; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX8-NEXT: s_not_b32 s8, s4
+; GFX8-NEXT: s_lshl_b32 s6, s4, 3
+; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
-; GFX8-NEXT: v_and_b32_e32 v4, s8, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX8-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3147,35 +3134,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
-; GFX7-NEXT: s_and_b32 s6, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX7-NEXT: s_and_b32 s4, s20, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
-; GFX7-NEXT: s_lshl_b32 s7, s4, 3
-; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX7-NEXT: s_not_b32 s8, s4
+; GFX7-NEXT: s_lshl_b32 s6, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3186,36 +3172,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
-; GFX6-NEXT: s_and_b32 s6, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX6-NEXT: s_and_b32 s4, s20, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
-; GFX6-NEXT: s_lshl_b32 s7, s4, 3
-; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-NEXT: s_not_b32 s8, s4
+; GFX6-NEXT: s_lshl_b32 s6, s4, 3
+; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX6-NEXT: v_max_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3235,15 +3220,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
+; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX12-NEXT: v_and_b32_e32 v10, -4, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v11, v7
+; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-NEXT: v_not_b32_e32 v9, v6
; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -3258,30 +3243,31 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
+; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB12_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX12-NEXT: v_max_num_f16_e32 v8, v5, v5
+; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v6
-; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
+; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -3297,15 +3283,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX12-NEXT: v_mov_b32_e32 v7, v8
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -3313,7 +3299,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_cbranch_execnz .LBB12_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -3321,12 +3307,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX940-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX940-NEXT: v_and_b32_e32 v9, -4, v4
; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0
-; GFX940-NEXT: v_not_b32_e32 v11, v6
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0
+; GFX940-NEXT: v_not_b32_e32 v10, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -3338,24 +3324,24 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen
+; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB12_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_max_f16_e32 v11, v5, v5
; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB12_4 Depth 2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_max_f16_e32 v8, v5, v5
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v8
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7
+; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX940-NEXT: v_max_f16_e32 v4, v4, v11
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX940-NEXT: s_mov_b64 s[8:9], exec
-; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
@@ -3369,36 +3355,36 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB12_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX940-NEXT: s_mov_b64 exec, s[8:9]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v7, v8
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB12_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX11-NEXT: v_and_b32_e32 v10, -4, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v11, v7
+; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-NEXT: v_not_b32_e32 v9, v6
; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -3410,29 +3396,30 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
+; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB12_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_max_f16_e32 v10, v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB12_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX11-NEXT: v_max_f16_e32 v8, v5, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX11-NEXT: v_max_f16_e32 v6, v6, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX11-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX11-NEXT: v_max_f16_e32 v4, v4, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_mov_b32_e32 v5, v6
; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -3446,14 +3433,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB12_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v7, v8
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -3462,20 +3449,20 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_cbranch_execnz .LBB12_3
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX10-NEXT: v_and_b32_e32 v10, -4, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v11, v7
+; GFX10-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX10-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v9, v6
; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -3485,26 +3472,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB12_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_max_f16_e32 v10, v5, v5
; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB12_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX10-NEXT: v_max_f16_e32 v8, v5, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX10-NEXT: v_max_f16_e32 v6, v6, v8
-; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX10-NEXT: v_mov_b32_e32 v9, v7
-; GFX10-NEXT: v_mov_b32_e32 v8, v6
+; GFX10-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f16_e32 v4, v4, v10
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v5, v6
; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -3516,15 +3503,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB12_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX10-NEXT: v_mov_b32_e32 v7, v8
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -3533,19 +3520,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: s_cbranch_execnz .LBB12_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4
; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4
-; GFX90A-NEXT: v_not_b32_e32 v11, v6
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4
+; GFX90A-NEXT: v_not_b32_e32 v10, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -3557,24 +3544,24 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5
; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_max_f16_e32 v8, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v8
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v11
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -3586,33 +3573,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v8
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX908-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX908-NEXT: v_and_b32_e32 v8, -4, v4
; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4
-; GFX908-NEXT: v_not_b32_e32 v11, v6
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX908-NEXT: v_not_b32_e32 v9, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -3624,25 +3611,25 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB12_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_max_f16_e32 v10, v5, v5
; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB12_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX908-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX908-NEXT: v_max_f16_e32 v8, v5, v5
-; GFX908-NEXT: v_max_f16_e32 v6, v6, v8
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX908-NEXT: v_mov_b32_e32 v9, v7
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX908-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX908-NEXT: v_max_f16_e32 v4, v4, v10
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v8, v6
+; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -3654,33 +3641,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB12_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v8
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB12_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
-; GFX8-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v8, -4, v4
; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4
-; GFX8-NEXT: v_not_b32_e32 v11, v6
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX8-NEXT: v_not_b32_e32 v9, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -3692,26 +3679,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_max_f16_e32 v10, v5, v5
; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB12_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX8-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX8-NEXT: v_max_f16_e32 v8, v5, v5
-; GFX8-NEXT: v_max_f16_e32 v6, v6, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX8-NEXT: v_and_b32_e32 v8, v7, v11
-; GFX8-NEXT: v_or_b32_e32 v6, v8, v6
-; GFX8-NEXT: v_mov_b32_e32 v9, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v8, v6
+; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -3723,21 +3710,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v8
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB12_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -3898,27 +3885,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_and_b32 s5, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_lshl_b32 s5, s5, 3
+; GFX12-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-NEXT: s_and_b32 s4, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_not_b32 s7, s6
-; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0
@@ -3928,23 +3916,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v2
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -3952,33 +3940,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s16, 0x200
-; GFX940-NEXT: s_and_b32 s6, s16, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s6
-; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s16, -4
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
; GFX940-NEXT: s_and_b32 s4, s16, 3
-; GFX940-NEXT: s_lshl_b32 s7, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX940-NEXT: s_not_b32 s8, s4
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
+; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: v_mov_b32_e32 v5, s6
-; GFX940-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_max_f32_e32 v0, v0, v5
; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v2, v2, v0, s9
+; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -3988,32 +3975,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX940-NEXT: s_cbranch_execnz .LBB13_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_and_b32 s5, s16, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s5, s5, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX11-NEXT: s_not_b32 s7, s6
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_and_b32 s4, s16, 3
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
@@ -4023,97 +4011,95 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB13_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: s_and_b32 s5, s20, 3
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: s_lshl_b32 s5, s5, 3
-; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX10-NEXT: s_not_b32 s7, s6
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
-; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: s_and_b32 s4, s20, 3
+; GFX10-NEXT: s_lshl_b32 s4, s4, 3
+; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX10-NEXT: s_not_b32 s6, s5
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v5
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
-; GFX90A-NEXT: s_and_b32 s6, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX90A-NEXT: s_and_b32 s4, s20, -4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
-; GFX90A-NEXT: s_lshl_b32 s7, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX90A-NEXT: s_not_b32 s8, s4
+; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
+; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
+; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v5
; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9
+; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, s6
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -4123,39 +4109,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
-; GFX908-NEXT: s_and_b32 s6, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX908-NEXT: s_and_b32 s4, s20, -4
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
-; GFX908-NEXT: s_lshl_b32 s7, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX908-NEXT: s_not_b32 s8, s4
+; GFX908-NEXT: s_lshl_b32 s6, s4, 3
+; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v5
; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9
+; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -4165,41 +4150,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
-; GFX8-NEXT: s_and_b32 s6, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX8-NEXT: s_and_b32 s4, s20, -4
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
-; GFX8-NEXT: s_lshl_b32 s7, s4, 3
-; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX8-NEXT: s_not_b32 s8, s4
+; GFX8-NEXT: s_lshl_b32 s6, s4, 3
+; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v5
; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, s8, v1
+; GFX8-NEXT: v_and_b32_e32 v2, s7, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -4209,38 +4193,37 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
-; GFX7-NEXT: s_and_b32 s6, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX7-NEXT: s_and_b32 s4, s20, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
-; GFX7-NEXT: s_lshl_b32 s7, s4, 3
-; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7
+; GFX7-NEXT: s_lshl_b32 s6, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: s_not_b32 s8, s4
+; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s8, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -4250,7 +4233,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -4258,32 +4241,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
-; GFX6-NEXT: s_and_b32 s6, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX6-NEXT: s_and_b32 s4, s20, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
-; GFX6-NEXT: s_lshl_b32 s7, s4, 3
-; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7
+; GFX6-NEXT: s_lshl_b32 s6, s4, 3
+; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: s_not_b32 s8, s4
+; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX6-NEXT: v_max_f32_e32 v0, v0, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s8, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -4293,7 +4275,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -4311,52 +4293,53 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_and_b32 s5, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_lshl_b32 s5, s5, 3
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: s_and_b32 s4, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_not_b32 s7, s6
-; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v3
-; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -4364,33 +4347,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s16, 0x200
-; GFX940-NEXT: s_and_b32 s6, s16, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s6
-; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s16, -4
+; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX940-NEXT: s_and_b32 s4, s16, 3
-; GFX940-NEXT: s_lshl_b32 s7, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX940-NEXT: s_not_b32 s8, s4
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: v_mov_b32_e32 v3, s6
-; GFX940-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_max_f32_e32 v0, v0, v3
; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v4, v4, v0, s9
+; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -4406,123 +4388,122 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_and_b32 s5, s16, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s5, s5, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX11-NEXT: s_not_b32 s7, s6
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: s_and_b32 s4, s16, 3
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v3
-; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB14_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: s_and_b32 s5, s20, 3
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: s_lshl_b32 s5, s5, 3
-; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX10-NEXT: s_not_b32 s7, s6
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
-; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: s_and_b32 s4, s20, 3
+; GFX10-NEXT: s_lshl_b32 s4, s4, 3
+; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX10-NEXT: s_not_b32 s6, s5
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB14_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
-; GFX90A-NEXT: s_and_b32 s6, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX90A-NEXT: s_and_b32 s4, s20, -4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
-; GFX90A-NEXT: s_lshl_b32 s7, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX90A-NEXT: s_not_b32 s8, s4
+; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
+; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3
; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9
+; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -4538,37 +4519,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
-; GFX908-NEXT: s_and_b32 s6, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX908-NEXT: s_and_b32 s4, s20, -4
+; GFX908-NEXT: v_mov_b32_e32 v2, s4
+; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
-; GFX908-NEXT: s_lshl_b32 s7, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX908-NEXT: s_not_b32 s8, s4
+; GFX908-NEXT: s_lshl_b32 s6, s4, 3
+; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4579,39 +4559,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
-; GFX8-NEXT: s_and_b32 s6, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX8-NEXT: s_and_b32 s4, s20, -4
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
-; GFX8-NEXT: s_lshl_b32 s7, s4, 3
-; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX8-NEXT: s_not_b32 s8, s4
+; GFX8-NEXT: s_lshl_b32 s6, s4, 3
+; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4622,36 +4601,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
-; GFX7-NEXT: s_and_b32 s6, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX7-NEXT: s_and_b32 s4, s20, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
-; GFX7-NEXT: s_lshl_b32 s7, s4, 3
-; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7
+; GFX7-NEXT: s_lshl_b32 s6, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: s_not_b32 s8, s4
+; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB14_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4662,37 +4640,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
-; GFX6-NEXT: s_and_b32 s6, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX6-NEXT: s_and_b32 s4, s20, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
-; GFX6-NEXT: s_lshl_b32 s7, s4, 3
-; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7
+; GFX6-NEXT: s_lshl_b32 s6, s4, 3
+; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: s_not_b32 s8, s4
+; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_max_f32_e32 v0, v0, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB14_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5419,27 +5396,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
+; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5
-; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v2
+; GFX12-NEXT: v_mov_b32_e32 v5, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v1, v4, v4
-; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5
+; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -5452,20 +5431,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v1, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_pk_max_f16 v1, v2, v2
; GFX940-NEXT: v_pk_max_f16 v0, v5, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, s6
-; GFX940-NEXT: v_pk_max_f16 v4, v0, v1
; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -5481,27 +5461,28 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: v_pk_max_f16 v2, v1, v1
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_pk_max_f16 v0, v2, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v1, v4, v4
-; GFX11-NEXT: v_pk_max_f16 v3, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX11-NEXT: v_pk_max_f16 v4, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -5513,27 +5494,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_pk_max_f16 v2, v1, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_pk_max_f16 v0, v2, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_max_f16 v1, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX10-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB16_1
@@ -5544,19 +5525,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -5572,25 +5553,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v1, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_pk_max_f16 v1, v2, v2
-; GFX908-NEXT: v_pk_max_f16 v0, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v0, v1
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX908-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB16_1
@@ -5601,29 +5582,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v3, v1, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v0, v6, v0
-; GFX8-NEXT: v_or_b32_e32 v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v6, v6
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v5, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
+; GFX8-NEXT: v_mov_b32_e32 v1, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
@@ -5646,30 +5627,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mov_b32_e32 v8, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5
-; GFX7-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v6, v0
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
@@ -5692,31 +5673,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v8, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX6-NEXT: v_max_f32_e32 v5, v5, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX6-NEXT: v_max_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5
-; GFX6-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v6, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB16_1
@@ -5738,25 +5719,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v1, v0, v0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
+; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_pk_max_num_f16 v1, v3, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v3, v1
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -5770,24 +5752,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s16
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v1, v0, v0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX940-NEXT: v_mov_b32_e32 v6, s6
-; GFX940-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX940-NEXT: v_pk_max_f16 v0, v1, v1
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_cbranch_execnz .LBB17_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5798,25 +5781,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v1, v0, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v0, v1, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_pk_max_f16 v1, v3, v1
-; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -5829,25 +5812,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v1, v0, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_pk_max_f16 v0, v1, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_max_f16 v1, v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB17_1
@@ -5859,23 +5842,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v1, v0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5886,24 +5869,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v1, v0, v0
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX908-NEXT: v_pk_max_f16 v1, v3, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX908-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5914,28 +5897,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v4, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v3, v5, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v1, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5946,41 +5929,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_mov_b32_e32 v7, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v0
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_or_b32_e32 v4, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
@@ -5992,42 +5975,42 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_mov_b32_e32 v7, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_max_f32_e32 v4, v4, v0
-; GFX6-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX6-NEXT: v_or_b32_e32 v4, v2, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_max_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_max_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB17_1
@@ -6048,7 +6031,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6064,26 +6047,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024
+; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX12-NEXT: v_pk_max_num_f16 v4, v5, v5
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v6, v8, v8
+; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v7, v6, v4
-; GFX12-NEXT: v_mov_b32_e32 v6, v7
-; GFX12-NEXT: v_mov_b32_e32 v7, v8
+; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8
+; GFX12-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6099,15 +6082,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6115,14 +6098,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_cbranch_execnz .LBB18_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v6
+; GFX12-NEXT: v_mov_b32_e32 v0, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -6134,23 +6117,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
+; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
; GFX940-NEXT: ; implicit-def: $vgpr4
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB18_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_pk_max_f16 v9, v5, v5
; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v6, v9, v9
+; GFX940-NEXT: v_pk_max_f16 v4, v7, v7
; GFX940-NEXT: s_mov_b64 s[8:9], exec
-; GFX940-NEXT: v_pk_max_f16 v8, v6, v4
+; GFX940-NEXT: v_pk_max_f16 v6, v4, v9
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -6163,27 +6146,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB18_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX940-NEXT: s_mov_b64 exec, s[8:9]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB18_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
@@ -6197,25 +6180,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_pk_max_f16 v8, v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX11-NEXT: v_pk_max_f16 v4, v5, v5
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v6, v8, v8
+; GFX11-NEXT: v_pk_max_f16 v4, v6, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v7, v6, v4
-; GFX11-NEXT: v_mov_b32_e32 v6, v7
-; GFX11-NEXT: v_mov_b32_e32 v7, v8
+; GFX11-NEXT: v_pk_max_f16 v5, v4, v8
+; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_mov_b32_e32 v5, v6
; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -6229,14 +6212,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX11-NEXT: v_mov_b32_e32 v8, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -6245,13 +6228,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_cbranch_execnz .LBB18_3
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v6
+; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
@@ -6263,24 +6246,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: ; implicit-def: $vgpr4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_pk_max_f16 v8, v5, v5
; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX10-NEXT: v_pk_max_f16 v4, v5, v5
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v6, v8, v8
+; GFX10-NEXT: v_pk_max_f16 v4, v6, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_max_f16 v7, v6, v4
-; GFX10-NEXT: v_mov_b32_e32 v6, v7
-; GFX10-NEXT: v_mov_b32_e32 v7, v8
+; GFX10-NEXT: v_pk_max_f16 v5, v4, v8
+; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v5, v6
; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -6292,15 +6275,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX10-NEXT: v_mov_b32_e32 v8, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -6309,13 +6292,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_cbranch_execnz .LBB18_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -6327,22 +6310,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: ; implicit-def: $vgpr4
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5
; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9
-; GFX90A-NEXT: v_pk_max_f16 v8, v6, v4
+; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7
+; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -6354,27 +6337,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4
+; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -6386,23 +6369,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: ; implicit-def: $vgpr4
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_pk_max_f16 v8, v5, v5
; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX908-NEXT: v_pk_max_f16 v4, v5, v5
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v6, v8, v8
-; GFX908-NEXT: v_pk_max_f16 v7, v6, v4
-; GFX908-NEXT: v_mov_b32_e32 v6, v7
+; GFX908-NEXT: v_pk_max_f16 v4, v6, v6
+; GFX908-NEXT: v_pk_max_f16 v5, v4, v8
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v7, v8
+; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -6414,27 +6397,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v8, v6
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB18_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -6446,27 +6429,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: ; implicit-def: $vgpr4
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v9, v5, v5
; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v6, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v6, v5, v5
-; GFX8-NEXT: v_max_f16_e32 v7, v8, v8
-; GFX8-NEXT: v_max_f16_e32 v6, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_mov_b32_e32 v6, v7
+; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_max_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v9
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v7, v8
+; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -6478,21 +6461,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v8, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB18_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -6669,43 +6652,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400
+; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2
+; GFX12-NEXT: v_mov_b32_e32 v6, v0
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NEXT: v_max_num_f32_e32 v1, v5, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_max_num_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7
-; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v5, s6
-; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4
-; GFX12-NEXT: v_perm_b32 v3, v1, v0, 0x7060302
+; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -6718,41 +6703,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX940-NEXT: s_add_i32 s8, s16, 0x400
+; GFX940-NEXT: s_add_i32 s4, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
-; GFX940-NEXT: s_mov_b32 s10, 0x7060302
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX940-NEXT: v_max_f32_e32 v0, v6, v0
-; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, s8
-; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5]
-; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX940-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
+; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
+; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_cbranch_execnz .LBB19_1
@@ -6763,45 +6748,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s6, s16, 0x400
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-NEXT: v_max_f32_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_max_f32_e32 v1, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_max_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4
+; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -6814,41 +6801,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s6, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v0, v3, v0
-; GFX10-NEXT: v_max_f32_e32 v1, v5, v1
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4
-; GFX10-NEXT: v_mov_b32_e32 v5, s6
-; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
+; GFX10-NEXT: v_mov_b32_e32 v1, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
@@ -6859,40 +6846,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s8, s20, 0x400
+; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s10, 0x7060302
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX90A-NEXT: v_max_f32_e32 v0, v6, v0
-; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10
-; GFX90A-NEXT: v_mov_b32_e32 v3, s8
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
@@ -6903,41 +6890,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s8, s20, 0x400
+; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
-; GFX908-NEXT: s_mov_b32 s10, 0x7060302
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v1, v3, v1
-; GFX908-NEXT: v_max_f32_e32 v0, v6, v0
-; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9
-; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s8
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
+; GFX908-NEXT: v_mov_b32_e32 v1, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB19_1
@@ -6948,42 +6935,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s8, s20, 0x400
+; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v1, v3, v1
-; GFX8-NEXT: v_max_f32_e32 v0, v6, v0
-; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s8
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
+; GFX8-NEXT: v_mov_b32_e32 v1, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
@@ -6995,38 +6982,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v3
; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v1
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7037,39 +7024,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX6-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX6-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX6-NEXT: v_max_f32_e32 v6, v6, v3
; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, s6
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v1
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB19_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7089,43 +7076,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v1, s16
-; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400
+; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v1, v3, v1
-; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v5, s6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -7139,40 +7124,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s16
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024
-; GFX940-NEXT: s_add_i32 s8, s16, 0x400
+; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_add_i32 s4, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
-; GFX940-NEXT: s_mov_b32 s10, 0x7060302
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v1
-; GFX940-NEXT: v_max_f32_e32 v2, v5, v4
-; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s8
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX940-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_cbranch_execnz .LBB20_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7182,45 +7167,43 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s6, s16, 0x400
+; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v1, v3, v1
-; GFX11-NEXT: v_max_f32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX11-NEXT: v_mov_b32_e32 v5, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v5
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -7234,39 +7217,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s6, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v1, v3, v1
-; GFX10-NEXT: v_max_f32_e32 v3, v5, v4
-; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v5, s6
-; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB20_1
@@ -7278,39 +7261,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s8, s20, 0x400
+; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s10, 0x7060302
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v1
-; GFX90A-NEXT: v_max_f32_e32 v2, v5, v4
-; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10
-; GFX90A-NEXT: v_mov_b32_e32 v6, s8
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7321,40 +7304,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s8, s20, 0x400
+; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
-; GFX908-NEXT: s_mov_b32 s10, 0x7060302
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX908-NEXT: v_max_f32_e32 v1, v3, v1
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v4
-; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v6, s8
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX908-NEXT: v_mov_b32_e32 v6, v1
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB20_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7365,41 +7348,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s8, s20, 0x400
+; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: v_max_f32_e32 v1, v3, v1
-; GFX8-NEXT: v_max_f32_e32 v3, v5, v4
-; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v6, s8
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB20_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7411,37 +7394,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v0
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB20_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7453,38 +7436,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_max_f32_e32 v4, v4, v0
-; GFX6-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v2, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v5, 16
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_max_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_max_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v6, s6
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB20_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7504,7 +7487,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -7520,43 +7503,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024
+; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX12-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v4, v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v6, v10, v7
-; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8
+; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX12-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo
+; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v7, v6, v4, 0x7060302
-; GFX12-NEXT: v_mov_b32_e32 v6, v7
-; GFX12-NEXT: v_mov_b32_e32 v7, v8
+; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX12-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -7572,15 +7554,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7588,14 +7570,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_cbranch_execnz .LBB21_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v6
+; GFX12-NEXT: v_mov_b32_e32 v0, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -7607,40 +7589,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
+; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
; GFX940-NEXT: ; implicit-def: $vgpr4
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB21_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX940-NEXT: s_movk_i32 s10, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX940-NEXT: s_mov_b32 s11, 0x7060302
; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v9
-; GFX940-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s10
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v9
+; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX940-NEXT: s_mov_b64 s[8:9], exec
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; GFX940-NEXT: v_max_f32_e32 v6, v7, v6
-; GFX940-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX940-NEXT: v_add3_u32 v7, v7, v6, s10
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_max_f32_e32 v5, v5, v10
+; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10
+; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX940-NEXT: v_perm_b32 v8, v6, v4, s11
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
+; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -7653,27 +7635,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB21_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX940-NEXT: s_mov_b64 exec, s[8:9]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB21_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -7687,41 +7669,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v6, v10, v7
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v9 :: v_dual_max_f32 v4, v4, v8
+; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v7, v6, v4, 0x7060302
-; GFX11-NEXT: v_mov_b32_e32 v6, v7
-; GFX11-NEXT: v_mov_b32_e32 v7, v8
+; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_mov_b32_e32 v5, v6
; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -7735,14 +7718,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB21_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX11-NEXT: v_mov_b32_e32 v8, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -7750,14 +7733,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_cbranch_execnz .LBB21_3
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v6
+; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -7769,38 +7753,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: ; implicit-def: $vgpr4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX10-NEXT: v_max_f32_e32 v6, v10, v7
-; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v8
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v9
+; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6
-; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo
-; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v6, v7
-; GFX10-NEXT: v_mov_b32_e32 v7, v8
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v5, v6
; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -7812,15 +7796,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX10-NEXT: v_mov_b32_e32 v8, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -7829,13 +7813,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_cbranch_execnz .LBB21_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7847,38 +7831,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: ; implicit-def: $vgpr4
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX90A-NEXT: s_mov_b32 s15, 0x7060302
; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9
-; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v9
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; GFX90A-NEXT: v_max_f32_e32 v6, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v10
+; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7890,27 +7874,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4
+; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -7922,39 +7906,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: ; implicit-def: $vgpr4
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX908-NEXT: s_movk_i32 s14, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX908-NEXT: s_mov_b32 s15, 0x7060302
; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX908-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX908-NEXT: v_max_f32_e32 v4, v4, v8
+; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
-; GFX908-NEXT: v_max_f32_e32 v6, v7, v6
-; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc
-; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15
-; GFX908-NEXT: v_mov_b32_e32 v6, v7
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v9
+; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14
+; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v7, v8
+; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -7966,27 +7950,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v8, v6
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB21_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -7998,40 +7982,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: ; implicit-def: $vgpr4
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX8-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT: v_max_f32_e32 v4, v4, v8
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
-; GFX8-NEXT: v_max_f32_e32 v6, v7, v6
-; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16
-; GFX8-NEXT: v_mov_b32_e32 v6, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v9
+; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v7, v8
+; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -8043,21 +8027,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v8, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB21_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -8236,19 +8220,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
; GFX940-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, s6
+; GFX940-NEXT: v_max_f32_e32 v4, v0, v2
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
@@ -8287,19 +8271,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
@@ -8317,25 +8301,25 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v0, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v0, v1
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
@@ -8352,19 +8336,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 0d38ee36415b80..0bcaacc6b08e8a 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -31,19 +31,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
; GFX940-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, s6
+; GFX940-NEXT: v_min_f32_e32 v4, v0, v2
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
@@ -82,19 +82,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -110,25 +110,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v0, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v0, v1
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB0_1
@@ -145,19 +145,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB0_1
@@ -207,24 +207,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s16
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v0, v0
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v1, v0, v0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT: v_min_f32_e32 v2, v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s6
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX940-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_cbranch_execnz .LBB1_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -257,23 +257,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v1, v0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT: v_min_f32_e32 v2, v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -284,24 +284,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v0, v0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v1, v0, v0
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX908-NEXT: v_min_f32_e32 v1, v3, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX908-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB1_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -316,20 +316,20 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1
; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB1_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -400,7 +400,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -412,22 +412,22 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
+; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
; GFX940-NEXT: ; implicit-def: $vgpr4
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB2_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_max_f32_e32 v9, v5, v5
; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB2_4 Depth 2
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v6, v9, v9
-; GFX940-NEXT: v_min_f32_e32 v8, v6, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v7, v7
+; GFX940-NEXT: v_min_f32_e32 v6, v4, v9
; GFX940-NEXT: s_mov_b64 s[8:9], exec
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
@@ -441,21 +441,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB2_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX940-NEXT: s_mov_b64 exec, s[8:9]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB2_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -520,7 +520,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -532,22 +532,22 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: ; implicit-def: $vgpr4
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5
; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9
-; GFX90A-NEXT: v_min_f32_e32 v8, v6, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7
+; GFX90A-NEXT: v_min_f32_e32 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -559,27 +559,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4
+; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -591,23 +591,23 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: ; implicit-def: $vgpr4
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_max_f32_e32 v8, v5, v5
; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB2_4 Depth 2
-; GFX908-NEXT: v_max_f32_e32 v4, v5, v5
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v6, v8, v8
-; GFX908-NEXT: v_min_f32_e32 v7, v6, v4
-; GFX908-NEXT: v_mov_b32_e32 v6, v7
+; GFX908-NEXT: v_max_f32_e32 v4, v6, v6
+; GFX908-NEXT: v_min_f32_e32 v5, v4, v8
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v7, v8
+; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -619,21 +619,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v8, v6
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB2_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -771,19 +771,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
; GFX940-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, s6
+; GFX940-NEXT: v_min_f32_e32 v4, v0, v2
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
@@ -800,28 +800,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v0
-; GFX11-NEXT: v_max_f32_e32 v0, v2, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v1, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX11-NEXT: v_min_f32_e32 v4, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v0, v3
-; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -833,27 +832,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v2, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v1, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX10-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB3_1
@@ -864,19 +863,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -892,25 +891,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v0, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v0, v1
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB3_1
@@ -927,19 +926,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB3_1
@@ -956,19 +955,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX7-NEXT: v_min_f32_e32 v3, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX7-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB3_1
@@ -985,20 +984,20 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX6-NEXT: v_min_f32_e32 v3, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX6-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB3_1
@@ -1029,19 +1028,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
; GFX940-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, s6
+; GFX940-NEXT: v_min_f32_e32 v4, v0, v2
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
@@ -1080,19 +1079,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1108,25 +1107,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v0, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v0, v1
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB4_1
@@ -1143,19 +1142,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB4_1
@@ -1198,29 +1197,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
-; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v10, s5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1243,30 +1243,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x800
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v10, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1298,29 +1298,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v9, v1
-; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v10, s6
-; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
-; GFX908-NEXT: v_mov_b32_e32 v1, v7
-; GFX908-NEXT: v_mov_b32_e32 v2, v8
-; GFX908-NEXT: v_mov_b32_e32 v3, v9
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
@@ -1331,29 +1331,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v10, s6
-; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB5_1
@@ -1393,27 +1393,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[0:1]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[6:7], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5
-; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT: v_mov_b32_e32 v6, v2
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1437,28 +1437,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x800
+; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX11-NEXT: s_add_i32 s4, s16, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3]
-; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v6, v2
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1491,27 +1490,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, s20
-; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
+; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
+; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v10, s6
-; GFX908-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v9, v5
-; GFX908-NEXT: v_mov_b32_e32 v8, v4
-; GFX908-NEXT: v_mov_b32_e32 v7, v3
-; GFX908-NEXT: v_mov_b32_e32 v6, v2
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v10, v3
+; GFX908-NEXT: v_mov_b32_e32 v9, v2
+; GFX908-NEXT: v_mov_b32_e32 v8, v1
+; GFX908-NEXT: v_mov_b32_e32 v7, v0
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v6
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX908-NEXT: v_mov_b32_e32 v2, v7
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v5, v7
+; GFX908-NEXT: v_mov_b32_e32 v3, v8
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1522,27 +1521,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
+; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
+; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v10, s6
-; GFX8-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v9, v5
-; GFX8-NEXT: v_mov_b32_e32 v8, v4
-; GFX8-NEXT: v_mov_b32_e32 v7, v3
-; GFX8-NEXT: v_mov_b32_e32 v6, v2
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v10, v3
+; GFX8-NEXT: v_mov_b32_e32 v9, v2
+; GFX8-NEXT: v_mov_b32_e32 v8, v1
+; GFX8-NEXT: v_mov_b32_e32 v7, v0
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v6
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, v7
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, v7
+; GFX8-NEXT: v_mov_b32_e32 v3, v8
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1606,17 +1605,17 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB7_4 Depth 2
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[5:6], v[5:6]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[13:14], v[13:14]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[2:3], v[0:1]
+; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
@@ -1710,17 +1709,17 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB7_4 Depth 2
-; GFX11-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14]
+; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1]
+; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
@@ -1838,15 +1837,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB7_4 Depth 2
-; GFX908-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14]
+; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1]
+; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5]
; GFX908-NEXT: v_mov_b32_e32 v0, v11
; GFX908-NEXT: v_mov_b32_e32 v1, v12
; GFX908-NEXT: v_mov_b32_e32 v2, v13
@@ -1904,15 +1903,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB7_4 Depth 2
-; GFX8-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14]
+; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1]
+; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, v11
; GFX8-NEXT: v_mov_b32_e32 v1, v12
; GFX8-NEXT: v_mov_b32_e32 v2, v13
@@ -2012,29 +2011,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
-; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v10, s5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2057,30 +2057,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x800
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v10, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2092,31 +2092,31 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: s_add_i32 s5, s20, 0x800
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: s_add_i32 s4, s20, 0x800
+; GFX10-NEXT: v_mov_b32_e32 v6, s4
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v10, s5
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
-; GFX10-NEXT: v_mov_b32_e32 v1, v7
-; GFX10-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-NEXT: v_mov_b32_e32 v3, v9
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX10-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
+; GFX10-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB8_1
@@ -2127,26 +2127,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX90A-NEXT: s_add_i32 s6, s20, 0x800
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT: v_mov_b32_e32 v6, s6
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v10, s6
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX90A-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
@@ -2157,29 +2157,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v9, v1
-; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v10, s6
-; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
-; GFX908-NEXT: v_mov_b32_e32 v1, v7
-; GFX908-NEXT: v_mov_b32_e32 v2, v8
-; GFX908-NEXT: v_mov_b32_e32 v3, v9
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB8_1
@@ -2190,29 +2190,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v10, s6
-; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB8_1
@@ -2223,29 +2223,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s20
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX7-NEXT: s_add_i32 s6, s20, 0x800
+; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s6
; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v10, s6
-; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, v6
-; GFX7-NEXT: v_mov_b32_e32 v1, v7
-; GFX7-NEXT: v_mov_b32_e32 v2, v8
-; GFX7-NEXT: v_mov_b32_e32 v3, v9
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v10, v1
+; GFX7-NEXT: v_mov_b32_e32 v9, v0
+; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX7-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
+; GFX7-NEXT: v_mov_b32_e32 v1, v8
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
+; GFX7-NEXT: v_mov_b32_e32 v3, v10
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB8_1
@@ -2256,30 +2256,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
; GFX6-NEXT: v_mov_b32_e32 v0, s20
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
; GFX6-NEXT: s_add_i32 s6, s20, 0x800
+; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_mov_b32_e32 v6, s6
; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v9, v1
-; GFX6-NEXT: v_mov_b32_e32 v8, v0
+; GFX6-NEXT: v_mov_b32_e32 v10, v1
+; GFX6-NEXT: v_mov_b32_e32 v9, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v10, s6
-; GFX6-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX6-NEXT: v_mov_b32_e32 v0, v6
-; GFX6-NEXT: v_mov_b32_e32 v1, v7
-; GFX6-NEXT: v_mov_b32_e32 v2, v8
-; GFX6-NEXT: v_mov_b32_e32 v3, v9
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX6-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v0, v7
+; GFX6-NEXT: v_mov_b32_e32 v1, v8
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
+; GFX6-NEXT: v_mov_b32_e32 v3, v10
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB8_1
@@ -2300,29 +2300,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
-; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v10, s5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2345,30 +2346,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x800
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v10, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2400,29 +2401,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v9, v1
-; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v10, s6
-; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
-; GFX908-NEXT: v_mov_b32_e32 v1, v7
-; GFX908-NEXT: v_mov_b32_e32 v2, v8
-; GFX908-NEXT: v_mov_b32_e32 v3, v9
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB9_1
@@ -2433,29 +2434,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v10, s6
-; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB9_1
@@ -2499,47 +2500,47 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_and_b32 s5, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_lshl_b32 s5, s5, 3
+; GFX12-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-NEXT: s_and_b32 s4, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_not_b32 s7, s6
-; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
-; GFX12-NEXT: v_min_num_f16_e32 v1, v1, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1
+; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v3, v1
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v3
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -2547,263 +2548,256 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s16, 0x200
-; GFX940-NEXT: s_and_b32 s6, s16, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s6
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s16, -4
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
; GFX940-NEXT: s_and_b32 s4, s16, 3
-; GFX940-NEXT: s_lshl_b32 s7, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX940-NEXT: s_not_b32 s8, s4
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f16_e32 v5, v0, v0
; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3
-; GFX940-NEXT: v_max_f16_e32 v2, v0, v0
-; GFX940-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX940-NEXT: v_min_f16_e32 v1, v1, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s6
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX940-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX940-NEXT: v_min_f16_e32 v0, v0, v5
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_cbranch_execnz .LBB10_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_max_f16_e32 v5, v0, v0
; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_and_b32 s5, s16, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s5, s5, 3
-; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_and_b32 s4, s16, 3
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_not_b32 s7, s6
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX11-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT: v_min_f16_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: v_min_f16_e32 v0, v0, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB10_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v3
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
+; GFX10-NEXT: v_max_f16_e32 v5, v0, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: s_and_b32 s5, s20, 3
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: s_lshl_b32 s5, s5, 3
-; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX10-NEXT: s_not_b32 s7, s6
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
-; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: s_and_b32 s4, s20, 3
+; GFX10-NEXT: s_lshl_b32 s4, s4, 3
+; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX10-NEXT: s_not_b32 s6, s5
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX10-NEXT: v_min_f16_e32 v1, v1, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT: v_min_f16_e32 v0, v0, v5
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
-; GFX90A-NEXT: s_and_b32 s6, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen
+; GFX90A-NEXT: s_and_b32 s4, s20, -4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
-; GFX90A-NEXT: s_lshl_b32 s7, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX90A-NEXT: s_not_b32 s8, s4
+; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
+; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3
-; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0
-; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX90A-NEXT: v_min_f16_e32 v1, v1, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT: v_min_f16_e32 v0, v0, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
-; GFX908-NEXT: s_and_b32 s6, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX908-NEXT: s_and_b32 s4, s20, -4
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
-; GFX908-NEXT: s_lshl_b32 s7, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX908-NEXT: s_not_b32 s8, s4
+; GFX908-NEXT: s_lshl_b32 s6, s4, 3
+; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v5, v0, v0
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX908-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX908-NEXT: v_min_f16_e32 v1, v1, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT: v_min_f16_e32 v0, v0, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB10_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
-; GFX8-NEXT: s_and_b32 s6, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX8-NEXT: s_and_b32 s4, s20, -4
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
-; GFX8-NEXT: s_lshl_b32 s7, s4, 3
-; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX8-NEXT: s_not_b32 s8, s4
+; GFX8-NEXT: s_lshl_b32 s6, s4, 3
+; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v5, v0, v0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_min_f16_e32 v1, v1, v3
-; GFX8-NEXT: v_and_b32_e32 v4, s8, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v5
+; GFX8-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
-; GFX7-NEXT: s_and_b32 s6, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX7-NEXT: s_and_b32 s4, s20, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
-; GFX7-NEXT: s_lshl_b32 s7, s4, 3
-; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX7-NEXT: s_not_b32 s8, s4
+; GFX7-NEXT: s_lshl_b32 s6, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s8, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -2813,7 +2807,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -2821,31 +2815,30 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
-; GFX6-NEXT: s_and_b32 s6, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX6-NEXT: s_and_b32 s4, s20, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
-; GFX6-NEXT: s_lshl_b32 s7, s4, 3
-; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX6-NEXT: s_not_b32 s8, s4
+; GFX6-NEXT: s_lshl_b32 s6, s4, 3
+; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s8, v1
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX6-NEXT: v_min_f32_e32 v0, v0, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -2855,7 +2848,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX6-NEXT: s_cbranch_execnz .LBB10_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2873,46 +2866,46 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_and_b32 s5, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_lshl_b32 s5, s5, 3
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: s_and_b32 s4, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_not_b32 s7, s6
-; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
-; GFX12-NEXT: v_min_num_f16_e32 v1, v1, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1
+; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v3, v1
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -2920,32 +2913,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s16, 0x200
-; GFX940-NEXT: s_and_b32 s6, s16, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s6
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s16, -4
+; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX940-NEXT: s_and_b32 s4, s16, 3
-; GFX940-NEXT: s_lshl_b32 s7, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX940-NEXT: s_not_b32 s8, s4
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f16_e32 v3, v0, v0
; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3
-; GFX940-NEXT: v_max_f16_e32 v2, v0, v0
-; GFX940-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX940-NEXT: v_min_f16_e32 v1, v1, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s6
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX940-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX940-NEXT: v_min_f16_e32 v0, v0, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_cbranch_execnz .LBB11_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2956,114 +2948,111 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_max_f16_e32 v3, v0, v0
; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_and_b32 s5, s16, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s5, s5, 3
-; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: s_and_b32 s4, s16, 3
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_not_b32 s7, s6
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX11-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT: v_min_f16_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: v_min_f16_e32 v0, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB11_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
+; GFX10-NEXT: v_max_f16_e32 v3, v0, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: s_and_b32 s5, s20, 3
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: s_lshl_b32 s5, s5, 3
-; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX10-NEXT: s_not_b32 s7, s6
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
-; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: s_and_b32 s4, s20, 3
+; GFX10-NEXT: s_lshl_b32 s4, s4, 3
+; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX10-NEXT: s_not_b32 s6, s5
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX10-NEXT: v_min_f16_e32 v1, v1, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT: v_min_f16_e32 v0, v0, v3
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
-; GFX90A-NEXT: s_and_b32 s6, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen
+; GFX90A-NEXT: s_and_b32 s4, s20, -4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
-; GFX90A-NEXT: s_lshl_b32 s7, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX90A-NEXT: s_not_b32 s8, s4
+; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
+; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3
-; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0
-; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX90A-NEXT: v_min_f16_e32 v1, v1, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT: v_min_f16_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3074,32 +3063,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
-; GFX908-NEXT: s_and_b32 s6, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX908-NEXT: s_and_b32 s4, s20, -4
+; GFX908-NEXT: v_mov_b32_e32 v2, s4
+; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
-; GFX908-NEXT: s_lshl_b32 s7, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX908-NEXT: s_not_b32 s8, s4
+; GFX908-NEXT: s_lshl_b32 s6, s4, 3
+; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v3, v0, v0
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX908-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX908-NEXT: v_min_f16_e32 v1, v1, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT: v_min_f16_e32 v0, v0, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3110,33 +3098,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
-; GFX8-NEXT: s_and_b32 s6, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX8-NEXT: s_and_b32 s4, s20, -4
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
-; GFX8-NEXT: s_lshl_b32 s7, s4, 3
-; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX8-NEXT: s_not_b32 s8, s4
+; GFX8-NEXT: s_lshl_b32 s6, s4, 3
+; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_min_f16_e32 v1, v1, v3
-; GFX8-NEXT: v_and_b32_e32 v4, s8, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v3
+; GFX8-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3147,35 +3134,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
-; GFX7-NEXT: s_and_b32 s6, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX7-NEXT: s_and_b32 s4, s20, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
-; GFX7-NEXT: s_lshl_b32 s7, s4, 3
-; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX7-NEXT: s_not_b32 s8, s4
+; GFX7-NEXT: s_lshl_b32 s6, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3186,36 +3172,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
-; GFX6-NEXT: s_and_b32 s6, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX6-NEXT: s_and_b32 s4, s20, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
-; GFX6-NEXT: s_lshl_b32 s7, s4, 3
-; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-NEXT: s_not_b32 s8, s4
+; GFX6-NEXT: s_lshl_b32 s6, s4, 3
+; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX6-NEXT: v_min_f32_e32 v0, v0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3235,15 +3220,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
+; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX12-NEXT: v_and_b32_e32 v10, -4, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v11, v7
+; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-NEXT: v_not_b32_e32 v9, v6
; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -3258,30 +3243,31 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
+; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB12_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX12-NEXT: v_max_num_f16_e32 v8, v5, v5
+; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v6
-; GFX12-NEXT: v_min_num_f16_e32 v6, v6, v8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
+; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -3297,15 +3283,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX12-NEXT: v_mov_b32_e32 v7, v8
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -3313,7 +3299,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_cbranch_execnz .LBB12_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -3321,12 +3307,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX940-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX940-NEXT: v_and_b32_e32 v9, -4, v4
; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0
-; GFX940-NEXT: v_not_b32_e32 v11, v6
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0
+; GFX940-NEXT: v_not_b32_e32 v10, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -3338,24 +3324,24 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen
+; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB12_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_max_f16_e32 v11, v5, v5
; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB12_4 Depth 2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_max_f16_e32 v8, v5, v5
-; GFX940-NEXT: v_min_f16_e32 v6, v6, v8
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7
+; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX940-NEXT: v_min_f16_e32 v4, v4, v11
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX940-NEXT: s_mov_b64 s[8:9], exec
-; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
@@ -3369,36 +3355,36 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB12_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX940-NEXT: s_mov_b64 exec, s[8:9]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v7, v8
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB12_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX11-NEXT: v_and_b32_e32 v10, -4, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v11, v7
+; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-NEXT: v_not_b32_e32 v9, v6
; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -3410,29 +3396,30 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
+; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB12_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_max_f16_e32 v10, v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB12_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX11-NEXT: v_max_f16_e32 v8, v5, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX11-NEXT: v_min_f16_e32 v6, v6, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX11-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX11-NEXT: v_min_f16_e32 v4, v4, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_mov_b32_e32 v5, v6
; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -3446,14 +3433,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB12_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v7, v8
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -3462,20 +3449,20 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_cbranch_execnz .LBB12_3
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX10-NEXT: v_and_b32_e32 v10, -4, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v11, v7
+; GFX10-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX10-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v9, v6
; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -3485,26 +3472,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB12_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_max_f16_e32 v10, v5, v5
; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB12_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX10-NEXT: v_max_f16_e32 v8, v5, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX10-NEXT: v_min_f16_e32 v6, v6, v8
-; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX10-NEXT: v_mov_b32_e32 v9, v7
-; GFX10-NEXT: v_mov_b32_e32 v8, v6
+; GFX10-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX10-NEXT: v_min_f16_e32 v4, v4, v10
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v5, v6
; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -3516,15 +3503,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB12_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX10-NEXT: v_mov_b32_e32 v7, v8
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -3533,19 +3520,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: s_cbranch_execnz .LBB12_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4
; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4
-; GFX90A-NEXT: v_not_b32_e32 v11, v6
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4
+; GFX90A-NEXT: v_not_b32_e32 v10, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -3557,24 +3544,24 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5
; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_max_f16_e32 v8, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v6, v6, v8
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_min_f16_e32 v4, v4, v11
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -3586,33 +3573,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v8
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX908-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX908-NEXT: v_and_b32_e32 v8, -4, v4
; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4
-; GFX908-NEXT: v_not_b32_e32 v11, v6
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX908-NEXT: v_not_b32_e32 v9, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -3624,25 +3611,25 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB12_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_max_f16_e32 v10, v5, v5
; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB12_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX908-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX908-NEXT: v_max_f16_e32 v8, v5, v5
-; GFX908-NEXT: v_min_f16_e32 v6, v6, v8
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX908-NEXT: v_mov_b32_e32 v9, v7
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX908-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX908-NEXT: v_min_f16_e32 v4, v4, v10
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v8, v6
+; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -3654,33 +3641,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB12_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v8
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB12_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
-; GFX8-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v8, -4, v4
; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4
-; GFX8-NEXT: v_not_b32_e32 v11, v6
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX8-NEXT: v_not_b32_e32 v9, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -3692,26 +3679,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_max_f16_e32 v10, v5, v5
; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB12_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX8-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX8-NEXT: v_max_f16_e32 v8, v5, v5
-; GFX8-NEXT: v_min_f16_e32 v6, v6, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX8-NEXT: v_and_b32_e32 v8, v7, v11
-; GFX8-NEXT: v_or_b32_e32 v6, v8, v6
-; GFX8-NEXT: v_mov_b32_e32 v9, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_min_f16_e32 v4, v4, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v8, v6
+; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -3723,21 +3710,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v8
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB12_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -3898,27 +3885,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_and_b32 s5, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_lshl_b32 s5, s5, 3
+; GFX12-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-NEXT: s_and_b32 s4, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_not_b32 s7, s6
-; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0
@@ -3928,23 +3916,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v2
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -3952,33 +3940,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s16, 0x200
-; GFX940-NEXT: s_and_b32 s6, s16, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s6
-; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s16, -4
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
; GFX940-NEXT: s_and_b32 s4, s16, 3
-; GFX940-NEXT: s_lshl_b32 s7, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX940-NEXT: s_not_b32 s8, s4
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
+; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: v_mov_b32_e32 v5, s6
-; GFX940-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_min_f32_e32 v0, v0, v5
; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v2, v2, v0, s9
+; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -3988,32 +3975,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX940-NEXT: s_cbranch_execnz .LBB13_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_and_b32 s5, s16, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s5, s5, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX11-NEXT: s_not_b32 s7, s6
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_and_b32 s4, s16, 3
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_min_f32_e32 v0, v0, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
@@ -4023,97 +4011,95 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB13_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: s_and_b32 s5, s20, 3
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: s_lshl_b32 s5, s5, 3
-; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX10-NEXT: s_not_b32 s7, s6
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
-; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: s_and_b32 s4, s20, 3
+; GFX10-NEXT: s_lshl_b32 s4, s4, 3
+; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX10-NEXT: s_not_b32 s6, s5
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v5
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
-; GFX90A-NEXT: s_and_b32 s6, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX90A-NEXT: s_and_b32 s4, s20, -4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
-; GFX90A-NEXT: s_lshl_b32 s7, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX90A-NEXT: s_not_b32 s8, s4
+; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
+; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
+; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v5
; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9
+; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, s6
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -4123,39 +4109,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
-; GFX908-NEXT: s_and_b32 s6, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX908-NEXT: s_and_b32 s4, s20, -4
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
-; GFX908-NEXT: s_lshl_b32 s7, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX908-NEXT: s_not_b32 s8, s4
+; GFX908-NEXT: s_lshl_b32 s6, s4, 3
+; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_min_f32_e32 v0, v0, v5
; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9
+; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -4165,41 +4150,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
-; GFX8-NEXT: s_and_b32 s6, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX8-NEXT: s_and_b32 s4, s20, -4
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
-; GFX8-NEXT: s_lshl_b32 s7, s4, 3
-; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX8-NEXT: s_not_b32 s8, s4
+; GFX8-NEXT: s_lshl_b32 s6, s4, 3
+; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_min_f32_e32 v3, v3, v5
; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, s8, v1
+; GFX8-NEXT: v_and_b32_e32 v2, s7, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -4209,38 +4193,37 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
-; GFX7-NEXT: s_and_b32 s6, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX7-NEXT: s_and_b32 s4, s20, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
-; GFX7-NEXT: s_lshl_b32 s7, s4, 3
-; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7
+; GFX7-NEXT: s_lshl_b32 s6, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: s_not_b32 s8, s4
+; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s8, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -4250,7 +4233,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -4258,32 +4241,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
-; GFX6-NEXT: s_and_b32 s6, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX6-NEXT: s_and_b32 s4, s20, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
-; GFX6-NEXT: s_lshl_b32 s7, s4, 3
-; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7
+; GFX6-NEXT: s_lshl_b32 s6, s4, 3
+; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: s_not_b32 s8, s4
+; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX6-NEXT: v_min_f32_e32 v0, v0, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s8, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
@@ -4293,7 +4275,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -4311,52 +4293,53 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_and_b32 s5, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_lshl_b32 s5, s5, 3
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: s_and_b32 s4, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_not_b32 s7, s6
-; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v3
-; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -4364,33 +4347,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s16, 0x200
-; GFX940-NEXT: s_and_b32 s6, s16, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s6
-; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s16, -4
+; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX940-NEXT: s_and_b32 s4, s16, 3
-; GFX940-NEXT: s_lshl_b32 s7, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX940-NEXT: s_not_b32 s8, s4
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: v_mov_b32_e32 v3, s6
-; GFX940-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_min_f32_e32 v0, v0, v3
; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v4, v4, v0, s9
+; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -4406,123 +4388,122 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_and_b32 s5, s16, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s5, s5, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX11-NEXT: s_not_b32 s7, s6
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: s_and_b32 s4, s16, 3
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_min_f32_e32 v0, v0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v3
-; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB14_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: s_and_b32 s5, s20, 3
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: s_lshl_b32 s5, s5, 3
-; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5
-; GFX10-NEXT: s_not_b32 s7, s6
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
-; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: s_and_b32 s4, s20, 3
+; GFX10-NEXT: s_lshl_b32 s4, s4, 3
+; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX10-NEXT: s_not_b32 s6, s5
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB14_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
-; GFX90A-NEXT: s_and_b32 s6, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX90A-NEXT: s_and_b32 s4, s20, -4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
-; GFX90A-NEXT: s_lshl_b32 s7, s4, 3
-; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX90A-NEXT: s_not_b32 s8, s4
+; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
+; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3
; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9
+; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -4538,37 +4519,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
-; GFX908-NEXT: s_and_b32 s6, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v1, s6
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX908-NEXT: s_and_b32 s4, s20, -4
+; GFX908-NEXT: v_mov_b32_e32 v2, s4
+; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
-; GFX908-NEXT: s_lshl_b32 s7, s4, 3
-; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX908-NEXT: s_not_b32 s8, s4
+; GFX908-NEXT: s_lshl_b32 s6, s4, 3
+; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4579,39 +4559,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
-; GFX8-NEXT: s_and_b32 s6, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX8-NEXT: s_and_b32 s4, s20, -4
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
-; GFX8-NEXT: s_lshl_b32 s7, s4, 3
-; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7
-; GFX8-NEXT: s_not_b32 s8, s4
+; GFX8-NEXT: s_lshl_b32 s6, s4, 3
+; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4622,36 +4601,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
-; GFX7-NEXT: s_and_b32 s6, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX7-NEXT: s_and_b32 s4, s20, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
-; GFX7-NEXT: s_lshl_b32 s7, s4, 3
-; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7
+; GFX7-NEXT: s_lshl_b32 s6, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: s_not_b32 s8, s4
+; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v5, s6
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB14_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4662,37 +4640,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
-; GFX6-NEXT: s_and_b32 s6, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen
+; GFX6-NEXT: s_and_b32 s4, s20, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
-; GFX6-NEXT: s_lshl_b32 s7, s4, 3
-; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7
+; GFX6-NEXT: s_lshl_b32 s6, s4, 3
+; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: s_not_b32 s8, s4
+; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_min_f32_e32 v0, v0, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, s8, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v5, s6
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB14_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5419,27 +5396,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
+; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5
-; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v2
+; GFX12-NEXT: v_mov_b32_e32 v5, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v1, v4, v4
-; GFX12-NEXT: v_pk_min_num_f16 v3, v1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5
+; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -5452,20 +5431,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v1, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_pk_max_f16 v1, v2, v2
; GFX940-NEXT: v_pk_max_f16 v0, v5, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, s6
-; GFX940-NEXT: v_pk_min_f16 v4, v0, v1
; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -5481,27 +5461,28 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: v_pk_max_f16 v2, v1, v1
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_pk_max_f16 v0, v2, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v1, v4, v4
-; GFX11-NEXT: v_pk_min_f16 v3, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX11-NEXT: v_pk_min_f16 v4, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -5513,27 +5494,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_pk_max_f16 v2, v1, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_pk_max_f16 v0, v2, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_max_f16 v1, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX10-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB16_1
@@ -5544,19 +5525,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -5572,25 +5553,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v1, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_pk_max_f16 v1, v2, v2
-; GFX908-NEXT: v_pk_max_f16 v0, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v0, v1
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX908-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB16_1
@@ -5601,29 +5582,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v3, v1, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v0, v6, v0
-; GFX8-NEXT: v_or_b32_e32 v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v6, v6
+; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v5, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
+; GFX8-NEXT: v_mov_b32_e32 v1, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
@@ -5646,30 +5627,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mov_b32_e32 v8, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5
-; GFX7-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v6, v0
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
@@ -5692,31 +5673,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v8, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX6-NEXT: v_min_f32_e32 v5, v5, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX6-NEXT: v_min_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5
-; GFX6-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v6, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB16_1
@@ -5738,25 +5719,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s16
-; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v1, v0, v0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
+; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_pk_min_num_f16 v1, v3, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v3, v1
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -5770,24 +5752,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s16
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v1, v0, v0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX940-NEXT: v_mov_b32_e32 v6, s6
-; GFX940-NEXT: v_pk_min_f16 v2, v2, v1
+; GFX940-NEXT: v_pk_max_f16 v0, v1, v1
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_cbranch_execnz .LBB17_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5798,25 +5781,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s5, s16, 0x400
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v1, v0, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v0, v1, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_pk_min_f16 v1, v3, v1
-; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -5829,25 +5812,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s5, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v1, v0, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: v_pk_max_f16 v0, v1, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_min_f16 v1, v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB17_1
@@ -5859,23 +5842,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v1, v0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5886,24 +5869,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v1, v0, v0
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX908-NEXT: v_pk_min_f16 v1, v3, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX908-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5914,28 +5897,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v4, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_min_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v3, v5, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v1, v1
+; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5946,41 +5929,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_mov_b32_e32 v7, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v0
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_or_b32_e32 v4, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
@@ -5992,42 +5975,42 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_mov_b32_e32 v7, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_min_f32_e32 v4, v4, v0
-; GFX6-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX6-NEXT: v_or_b32_e32 v4, v2, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_min_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_min_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB17_1
@@ -6048,7 +6031,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6064,26 +6047,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024
+; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX12-NEXT: v_pk_max_num_f16 v4, v5, v5
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v6, v8, v8
+; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v7, v6, v4
-; GFX12-NEXT: v_mov_b32_e32 v6, v7
-; GFX12-NEXT: v_mov_b32_e32 v7, v8
+; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8
+; GFX12-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6099,15 +6082,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6115,14 +6098,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_cbranch_execnz .LBB18_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v6
+; GFX12-NEXT: v_mov_b32_e32 v0, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -6134,23 +6117,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
+; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
; GFX940-NEXT: ; implicit-def: $vgpr4
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB18_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_pk_max_f16 v9, v5, v5
; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v6, v9, v9
+; GFX940-NEXT: v_pk_max_f16 v4, v7, v7
; GFX940-NEXT: s_mov_b64 s[8:9], exec
-; GFX940-NEXT: v_pk_min_f16 v8, v6, v4
+; GFX940-NEXT: v_pk_min_f16 v6, v4, v9
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -6163,27 +6146,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB18_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX940-NEXT: s_mov_b64 exec, s[8:9]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB18_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
@@ -6197,25 +6180,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_pk_max_f16 v8, v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX11-NEXT: v_pk_max_f16 v4, v5, v5
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v6, v8, v8
+; GFX11-NEXT: v_pk_max_f16 v4, v6, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v7, v6, v4
-; GFX11-NEXT: v_mov_b32_e32 v6, v7
-; GFX11-NEXT: v_mov_b32_e32 v7, v8
+; GFX11-NEXT: v_pk_min_f16 v5, v4, v8
+; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_mov_b32_e32 v5, v6
; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -6229,14 +6212,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX11-NEXT: v_mov_b32_e32 v8, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -6245,13 +6228,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_cbranch_execnz .LBB18_3
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v6
+; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
@@ -6263,24 +6246,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: ; implicit-def: $vgpr4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_pk_max_f16 v8, v5, v5
; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX10-NEXT: v_pk_max_f16 v4, v5, v5
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v6, v8, v8
+; GFX10-NEXT: v_pk_max_f16 v4, v6, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_min_f16 v7, v6, v4
-; GFX10-NEXT: v_mov_b32_e32 v6, v7
-; GFX10-NEXT: v_mov_b32_e32 v7, v8
+; GFX10-NEXT: v_pk_min_f16 v5, v4, v8
+; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v5, v6
; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -6292,15 +6275,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX10-NEXT: v_mov_b32_e32 v8, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -6309,13 +6292,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_cbranch_execnz .LBB18_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -6327,22 +6310,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: ; implicit-def: $vgpr4
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5
; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9
-; GFX90A-NEXT: v_pk_min_f16 v8, v6, v4
+; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7
+; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -6354,27 +6337,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4
+; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -6386,23 +6369,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: ; implicit-def: $vgpr4
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_pk_max_f16 v8, v5, v5
; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX908-NEXT: v_pk_max_f16 v4, v5, v5
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v6, v8, v8
-; GFX908-NEXT: v_pk_min_f16 v7, v6, v4
-; GFX908-NEXT: v_mov_b32_e32 v6, v7
+; GFX908-NEXT: v_pk_max_f16 v4, v6, v6
+; GFX908-NEXT: v_pk_min_f16 v5, v4, v8
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v7, v8
+; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -6414,27 +6397,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v8, v6
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB18_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -6446,27 +6429,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: ; implicit-def: $vgpr4
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v9, v5, v5
; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v6, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v6, v5, v5
-; GFX8-NEXT: v_max_f16_e32 v7, v8, v8
-; GFX8-NEXT: v_min_f16_e32 v6, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_mov_b32_e32 v6, v7
+; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_min_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v9
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v7, v8
+; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -6478,21 +6461,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v8, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB18_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -6669,43 +6652,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400
+; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2
+; GFX12-NEXT: v_mov_b32_e32 v6, v0
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX12-NEXT: v_min_num_f32_e32 v1, v1, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NEXT: v_min_num_f32_e32 v1, v5, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_min_num_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7
-; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v5, s6
-; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4
-; GFX12-NEXT: v_perm_b32 v3, v1, v0, 0x7060302
+; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -6718,41 +6703,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX940-NEXT: s_add_i32 s8, s16, 0x400
+; GFX940-NEXT: s_add_i32 s4, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
-; GFX940-NEXT: s_mov_b32 s10, 0x7060302
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v1, v4, v1
-; GFX940-NEXT: v_min_f32_e32 v0, v6, v0
-; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, s8
-; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5]
-; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX940-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
+; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
+; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_cbranch_execnz .LBB19_1
@@ -6763,45 +6748,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s6, s16, 0x400
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-NEXT: v_min_f32_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_min_f32_e32 v1, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_min_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4
+; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -6814,41 +6801,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s6, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v0, v3, v0
-; GFX10-NEXT: v_min_f32_e32 v1, v5, v1
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4
-; GFX10-NEXT: v_mov_b32_e32 v5, s6
-; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
+; GFX10-NEXT: v_mov_b32_e32 v1, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
@@ -6859,40 +6846,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s8, s20, 0x400
+; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s10, 0x7060302
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v1, v4, v1
-; GFX90A-NEXT: v_min_f32_e32 v0, v6, v0
-; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10
-; GFX90A-NEXT: v_mov_b32_e32 v3, s8
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
@@ -6903,41 +6890,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s8, s20, 0x400
+; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
-; GFX908-NEXT: s_mov_b32 s10, 0x7060302
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v1, v3, v1
-; GFX908-NEXT: v_min_f32_e32 v0, v6, v0
-; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9
-; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s8
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX908-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
+; GFX908-NEXT: v_mov_b32_e32 v1, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB19_1
@@ -6948,42 +6935,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s8, s20, 0x400
+; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v1, v3, v1
-; GFX8-NEXT: v_min_f32_e32 v0, v6, v0
-; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s8
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
+; GFX8-NEXT: v_mov_b32_e32 v1, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
@@ -6995,38 +6982,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v3
; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v1
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7037,39 +7024,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX6-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX6-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX6-NEXT: v_min_f32_e32 v6, v6, v3
; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, s6
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v1
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB19_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7089,43 +7076,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v1, s16
-; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400
+; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v1, v3, v1
-; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v5, s6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -7139,40 +7124,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s16
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024
-; GFX940-NEXT: s_add_i32 s8, s16, 0x400
+; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_add_i32 s4, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
-; GFX940-NEXT: s_movk_i32 s9, 0x7fff
-; GFX940-NEXT: s_mov_b32 s10, 0x7060302
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX940-NEXT: v_min_f32_e32 v1, v2, v1
-; GFX940-NEXT: v_min_f32_e32 v2, v5, v4
-; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX940-NEXT: v_mov_b32_e32 v6, s8
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX940-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_cbranch_execnz .LBB20_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7182,45 +7167,43 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s6, s16, 0x400
+; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v1, v3, v1
-; GFX11-NEXT: v_min_f32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX11-NEXT: v_mov_b32_e32 v5, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v5
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -7234,39 +7217,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s6, s20, 0x400
+; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_f32_e32 v1, v3, v1
-; GFX10-NEXT: v_min_f32_e32 v3, v5, v4
-; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v5, s6
-; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB20_1
@@ -7278,39 +7261,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s8, s20, 0x400
+; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: s_movk_i32 s9, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s10, 0x7060302
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_min_f32_e32 v1, v2, v1
-; GFX90A-NEXT: v_min_f32_e32 v2, v5, v4
-; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10
-; GFX90A-NEXT: v_mov_b32_e32 v6, s8
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7321,40 +7304,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s8, s20, 0x400
+; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: s_movk_i32 s9, 0x7fff
-; GFX908-NEXT: s_mov_b32 s10, 0x7060302
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX908-NEXT: v_min_f32_e32 v1, v3, v1
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v4
-; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9
-; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v6, s8
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX908-NEXT: v_mov_b32_e32 v6, v1
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB20_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7365,41 +7348,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s8, s20, 0x400
+; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: v_min_f32_e32 v1, v3, v1
-; GFX8-NEXT: v_min_f32_e32 v3, v5, v4
-; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v6, s8
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB20_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7411,37 +7394,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v0
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB20_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7453,38 +7436,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_min_f32_e32 v4, v4, v0
-; GFX6-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v2, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v5, 16
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_min_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_min_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v6, s6
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB20_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7504,7 +7487,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -7520,43 +7503,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024
+; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX12-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_min_num_f32_e32 v4, v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v6, v10, v7
-; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8
+; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX12-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo
+; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v7, v6, v4, 0x7060302
-; GFX12-NEXT: v_mov_b32_e32 v6, v7
-; GFX12-NEXT: v_mov_b32_e32 v7, v8
+; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX12-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -7572,15 +7554,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7588,14 +7570,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_cbranch_execnz .LBB21_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v6
+; GFX12-NEXT: v_mov_b32_e32 v0, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -7607,40 +7589,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
+; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
; GFX940-NEXT: ; implicit-def: $vgpr4
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB21_1
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX940-NEXT: s_movk_i32 s10, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX940-NEXT: s_mov_b32 s11, 0x7060302
; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v9
-; GFX940-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s10
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX940-NEXT: v_min_f32_e32 v4, v4, v9
+; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX940-NEXT: s_mov_b64 s[8:9], exec
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; GFX940-NEXT: v_min_f32_e32 v6, v7, v6
-; GFX940-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX940-NEXT: v_add3_u32 v7, v7, v6, s10
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_min_f32_e32 v5, v5, v10
+; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10
+; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX940-NEXT: v_perm_b32 v8, v6, v4, s11
-; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
+; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -7653,27 +7635,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB21_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX940-NEXT: s_mov_b64 exec, s[8:9]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB21_3
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -7687,41 +7669,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v6, v10, v7
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8
+; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v7, v6, v4, 0x7060302
-; GFX11-NEXT: v_mov_b32_e32 v6, v7
-; GFX11-NEXT: v_mov_b32_e32 v7, v8
+; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_mov_b32_e32 v5, v6
; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -7735,14 +7718,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB21_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX11-NEXT: v_mov_b32_e32 v8, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -7750,14 +7733,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_cbranch_execnz .LBB21_3
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v6
+; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -7769,38 +7753,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: ; implicit-def: $vgpr4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX10-NEXT: v_min_f32_e32 v6, v10, v7
-; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v8
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v9
+; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6
-; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo
-; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v6, v7
-; GFX10-NEXT: v_mov_b32_e32 v7, v8
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v5, v6
; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -7812,15 +7796,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX10-NEXT: v_mov_b32_e32 v8, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -7829,13 +7813,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_cbranch_execnz .LBB21_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4
+; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7847,38 +7831,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: ; implicit-def: $vgpr4
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX90A-NEXT: s_mov_b32 s15, 0x7060302
; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9
-; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX90A-NEXT: v_min_f32_e32 v4, v4, v9
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; GFX90A-NEXT: v_min_f32_e32 v6, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v10
+; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7890,27 +7874,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4
+; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -7922,39 +7906,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: ; implicit-def: $vgpr4
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX908-NEXT: s_movk_i32 s14, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX908-NEXT: s_mov_b32 s15, 0x7060302
; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX908-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX908-NEXT: v_min_f32_e32 v4, v4, v8
+; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
-; GFX908-NEXT: v_min_f32_e32 v6, v7, v6
-; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc
-; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15
-; GFX908-NEXT: v_mov_b32_e32 v6, v7
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v9
+; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14
+; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v7, v8
+; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -7966,27 +7950,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v8, v6
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB21_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -7998,40 +7982,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: ; implicit-def: $vgpr4
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX8-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT: v_min_f32_e32 v4, v4, v8
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
-; GFX8-NEXT: v_min_f32_e32 v6, v7, v6
-; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16
-; GFX8-NEXT: v_mov_b32_e32 v6, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v9
+; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v7, v8
+; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -8043,21 +8027,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v8, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB21_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -8236,19 +8220,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v0
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
; GFX940-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v0, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, s6
+; GFX940-NEXT: v_min_f32_e32 v4, v0, v2
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
@@ -8287,19 +8271,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
@@ -8317,25 +8301,25 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v0, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v0, v1
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
-; GFX908-NEXT: v_mov_b32_e32 v5, s6
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
@@ -8352,19 +8336,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
index 28a2245406842c..3216e71e6221ae 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -245,14 +245,14 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
; GISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1
; GISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split
; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7]
-; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0
; GISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base
+; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0
+; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9
; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
; GISEL-ASM-NEXT: .LBB7_3: ; %finally
; GISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1
-; GISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[4:5]
-; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
-; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9
+; GISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[4:5]
+; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2
; GISEL-ASM-NEXT: s_waitcnt vmcnt(0)
; GISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
index a8ddece6117839..9104dc68eb9b49 100644
--- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
@@ -6,90 +6,95 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3]
; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1]
-; CHECK-NEXT: s_load_dword s4, s[8:9], 0x0
+; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: s_load_dword s12, s[8:9], 0x4
+; CHECK-NEXT: s_load_dword s6, s[8:9], 0x4
; CHECK-NEXT: s_add_u32 s24, s24, s15
; CHECK-NEXT: s_addc_u32 s25, s25, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_bitcmp1_b32 s4, 0
+; CHECK-NEXT: s_bitcmp1_b32 s2, 0
+; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0
+; CHECK-NEXT: s_bitcmp1_b32 s2, 8
+; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0
+; CHECK-NEXT: s_bitcmp1_b32 s2, 16
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_bitcmp1_b32 s4, 8
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_bitcmp1_b32 s4, 16
-; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
; CHECK-NEXT: s_bitcmp1_b32 s0, 24
-; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
-; CHECK-NEXT: s_xor_b64 s[14:15], s[4:5], -1
+; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
+; CHECK-NEXT: s_xor_b64 s[4:5], s[8:9], -1
; CHECK-NEXT: s_bitcmp1_b32 s1, 0
-; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0
-; CHECK-NEXT: s_bitcmp1_b32 s12, 8
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0
-; CHECK-NEXT: s_and_b64 s[0:1], exec, s[14:15]
+; CHECK-NEXT: s_bitcmp1_b32 s6, 8
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[16:17]
+; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0
+; CHECK-NEXT: s_and_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT: s_and_b64 s[6:7], exec, s[10:11]
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_branch .LBB0_3
; CHECK-NEXT: .LBB0_1: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_mov_b64 s[16:17], 0
-; CHECK-NEXT: s_mov_b64 s[18:19], -1
-; CHECK-NEXT: s_mov_b64 s[14:15], -1
+; CHECK-NEXT: s_mov_b64 s[18:19], 0
; CHECK-NEXT: s_mov_b64 s[20:21], -1
+; CHECK-NEXT: s_mov_b64 s[16:17], -1
+; CHECK-NEXT: s_mov_b64 s[22:23], -1
; CHECK-NEXT: .LBB0_2: ; %Flow7
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_and_b64 vcc, exec, s[20:21]
+; CHECK-NEXT: s_and_b64 vcc, exec, s[22:23]
; CHECK-NEXT: s_cbranch_vccnz .LBB0_12
; CHECK-NEXT: .LBB0_3: ; %bb7
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; CHECK-NEXT: s_and_b64 vcc, exec, s[2:3]
; CHECK-NEXT: s_cbranch_vccnz .LBB0_1
; CHECK-NEXT: ; %bb.4: ; %bb8
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_mov_b64 vcc, s[0:1]
+; CHECK-NEXT: s_mov_b64 vcc, s[4:5]
; CHECK-NEXT: s_cbranch_vccz .LBB0_6
; CHECK-NEXT: ; %bb.5: ; %bb9
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_mov_b64 s[14:15], 0
-; CHECK-NEXT: s_mov_b64 s[16:17], -1
-; CHECK-NEXT: s_mov_b64 s[20:21], s[6:7]
+; CHECK-NEXT: s_mov_b64 s[16:17], 0
+; CHECK-NEXT: s_mov_b64 s[18:19], -1
+; CHECK-NEXT: s_mov_b64 s[22:23], s[10:11]
; CHECK-NEXT: s_cbranch_execz .LBB0_7
; CHECK-NEXT: s_branch .LBB0_8
; CHECK-NEXT: .LBB0_6: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_mov_b64 s[14:15], -1
-; CHECK-NEXT: s_mov_b64 s[16:17], 0
-; CHECK-NEXT: s_mov_b64 s[20:21], 0
+; CHECK-NEXT: s_mov_b64 s[16:17], -1
+; CHECK-NEXT: s_mov_b64 s[18:19], 0
+; CHECK-NEXT: s_mov_b64 s[22:23], 0
; CHECK-NEXT: .LBB0_7: ; %bb10
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_mov_b64 s[16:17], -1
-; CHECK-NEXT: s_mov_b64 s[14:15], 0
-; CHECK-NEXT: s_mov_b64 s[20:21], s[12:13]
+; CHECK-NEXT: s_mov_b64 s[18:19], -1
+; CHECK-NEXT: s_mov_b64 s[16:17], 0
+; CHECK-NEXT: s_mov_b64 s[22:23], s[14:15]
; CHECK-NEXT: .LBB0_8: ; %Flow9
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_mov_b64 s[18:19], -1
-; CHECK-NEXT: s_andn2_b64 vcc, exec, s[20:21]
; CHECK-NEXT: s_mov_b64 s[20:21], -1
+; CHECK-NEXT: s_andn2_b64 vcc, exec, s[22:23]
+; CHECK-NEXT: s_mov_b64 s[22:23], -1
; CHECK-NEXT: s_cbranch_vccnz .LBB0_2
; CHECK-NEXT: ; %bb.9: ; %bb13
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_mov_b64 s[14:15], 0
-; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7]
-; CHECK-NEXT: s_mov_b64 s[18:19], 0
+; CHECK-NEXT: s_mov_b64 s[16:17], 0
+; CHECK-NEXT: s_mov_b64 s[20:21], 0
+; CHECK-NEXT: s_mov_b64 vcc, s[6:7]
; CHECK-NEXT: s_cbranch_vccz .LBB0_11
; CHECK-NEXT: ; %bb.10: ; %bb16
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_mov_b64 s[18:19], -1
-; CHECK-NEXT: s_mov_b64 s[20:21], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[20:21], -1
+; CHECK-NEXT: s_mov_b64 s[22:23], s[12:13]
; CHECK-NEXT: .LBB0_11: ; %Flow11
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_mov_b64 s[16:17], 0
+; CHECK-NEXT: s_mov_b64 s[18:19], 0
; CHECK-NEXT: s_branch .LBB0_2
; CHECK-NEXT: .LBB0_12: ; %loop.exit.guard6
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_xor_b64 s[20:21], s[18:19], -1
-; CHECK-NEXT: s_mov_b64 s[18:19], -1
-; CHECK-NEXT: s_and_b64 vcc, exec, s[20:21]
+; CHECK-NEXT: s_xor_b64 s[22:23], s[20:21], -1
+; CHECK-NEXT: s_mov_b64 s[20:21], -1
+; CHECK-NEXT: s_and_b64 vcc, exec, s[22:23]
; CHECK-NEXT: s_cbranch_vccz .LBB0_16
; CHECK-NEXT: ; %bb.13: ; %bb14
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT: s_cbranch_vccnz .LBB0_15
; CHECK-NEXT: ; %bb.14: ; %bb15
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
@@ -97,22 +102,22 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
; CHECK-NEXT: buffer_store_dword v0, off, s[24:27], 0
; CHECK-NEXT: .LBB0_15: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_mov_b64 s[18:19], 0
+; CHECK-NEXT: s_mov_b64 s[20:21], 0
; CHECK-NEXT: .LBB0_16: ; %Flow13
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: s_andn2_b64 vcc, exec, s[18:19]
+; CHECK-NEXT: s_andn2_b64 vcc, exec, s[20:21]
; CHECK-NEXT: s_cbranch_vccnz .LBB0_3
; CHECK-NEXT: ; %bb.17: ; %loop.exit.guard
-; CHECK-NEXT: s_and_b64 vcc, exec, s[14:15]
+; CHECK-NEXT: s_and_b64 vcc, exec, s[16:17]
; CHECK-NEXT: s_cbranch_vccnz .LBB0_22
; CHECK-NEXT: ; %bb.18: ; %loop.exit.guard5
-; CHECK-NEXT: s_and_b64 vcc, exec, s[16:17]
+; CHECK-NEXT: s_and_b64 vcc, exec, s[18:19]
; CHECK-NEXT: s_cbranch_vccnz .LBB0_23
; CHECK-NEXT: ; %bb.19: ; %bb17
-; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5]
+; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9]
; CHECK-NEXT: s_cbranch_vccz .LBB0_21
; CHECK-NEXT: ; %bb.20: ; %bb19
-; CHECK-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT: s_cbranch_vccz .LBB0_22
; CHECK-NEXT: .LBB0_21: ; %bb18
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 4281a4b74cbaeb..ff48a3fc980187 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -16515,39 +16515,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16562,34 +16564,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB68_1
@@ -16603,33 +16605,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB68_1
@@ -16643,33 +16645,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB68_1
@@ -16683,34 +16685,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB68_1
@@ -16788,39 +16790,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16836,35 +16840,35 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB69_1
@@ -16877,33 +16881,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB69_1
@@ -16917,33 +16921,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB69_1
@@ -16959,34 +16963,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_add_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB69_1
@@ -17074,38 +17078,40 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: v_dual_add_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -17120,35 +17126,35 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB70_1
@@ -17165,33 +17171,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX90A-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX90A-NEXT: v_add_f32_e32 v0, v7, v0
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB70_1
@@ -17208,33 +17214,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX908-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_add_f32_e32 v0, v7, v0
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB70_1
@@ -17249,34 +17255,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_add_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB70_1
@@ -17353,41 +17359,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -17400,35 +17406,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB71_1
@@ -17439,36 +17445,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB71_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17478,36 +17484,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB71_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17517,37 +17523,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB71_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17620,41 +17626,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -17669,35 +17675,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB72_1
@@ -17708,36 +17714,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB72_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17747,36 +17753,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB72_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17788,37 +17794,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB72_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17901,41 +17907,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v4, v[3:4]
+; GFX11-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -17950,35 +17956,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB73_1
@@ -17995,28 +18001,28 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v3, v0
-; GFX90A-NEXT: v_add_f32_e32 v3, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -18038,28 +18044,28 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX908-NEXT: v_add_f32_e32 v0, v5, v0
-; GFX908-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -18077,37 +18083,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB73_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18185,39 +18191,41 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -18233,35 +18241,35 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT: v_add_f32_e32 v0, v6, v0
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB74_1
@@ -18274,35 +18282,35 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB74_1
@@ -18316,33 +18324,33 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB74_1
@@ -18358,34 +18366,34 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_add_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB74_1
@@ -18463,41 +18471,41 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -18512,35 +18520,35 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB75_1
@@ -18551,38 +18559,38 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB75_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18592,36 +18600,36 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX908-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB75_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18633,37 +18641,37 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB75_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18740,39 +18748,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -18787,34 +18797,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB76_1
@@ -18828,33 +18838,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB76_1
@@ -18868,33 +18878,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB76_1
@@ -18908,34 +18918,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB76_1
@@ -19012,41 +19022,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -19059,35 +19069,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB77_1
@@ -19098,36 +19108,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB77_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19137,36 +19147,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB77_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19176,37 +19186,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB77_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19280,39 +19290,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -19327,34 +19339,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB78_1
@@ -19368,33 +19380,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB78_1
@@ -19408,33 +19420,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB78_1
@@ -19448,34 +19460,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB78_1
@@ -19552,41 +19564,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -19599,35 +19611,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB79_1
@@ -19638,36 +19650,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB79_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19677,36 +19689,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB79_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19716,37 +19728,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB79_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 36d7201c6fe467..36aa73fbf8e92a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -31,13 +31,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -76,13 +76,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -100,13 +100,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -173,13 +173,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -220,13 +220,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -244,13 +244,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -329,18 +329,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX940-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_max_f32_e32 v0, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v0, v1
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB2_1
@@ -381,17 +381,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
@@ -408,17 +408,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX908-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v6, v6
+; GFX908-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_1
@@ -482,21 +482,21 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB3_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -528,20 +528,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -551,20 +551,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB3_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -622,21 +622,21 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB4_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -670,20 +670,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB4_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -693,20 +693,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB4_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -773,23 +773,23 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX940-NEXT: s_movk_i32 s0, 0xf800
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dword v5, v[4:5]
+; GFX940-NEXT: flat_load_dword v3, v[4:5]
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -831,12 +831,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1
-; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0
+; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -858,12 +858,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v0, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v1, v1
-; GFX908-NEXT: v_max_f32_e32 v0, v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -935,13 +935,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -982,13 +982,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1008,13 +1008,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1084,21 +1084,21 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB7_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1132,22 +1132,22 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB7_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1157,20 +1157,20 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1235,13 +1235,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1259,14 +1259,15 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1286,14 +1287,14 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX10-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1313,13 +1314,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1337,13 +1338,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1426,13 +1427,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1471,13 +1472,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1495,13 +1496,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1572,13 +1573,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1617,13 +1618,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1641,13 +1642,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1714,13 +1715,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1761,13 +1762,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1785,13 +1786,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1870,18 +1871,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX940-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_max_f32_e32 v0, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v0, v1
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB12_1
@@ -1922,17 +1923,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
@@ -1949,17 +1950,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX908-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v6, v6
+; GFX908-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB12_1
@@ -2023,21 +2024,21 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB13_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2069,20 +2070,20 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2092,20 +2093,20 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2163,21 +2164,21 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB14_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2211,20 +2212,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2234,20 +2235,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2314,23 +2315,23 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX940-NEXT: s_movk_i32 s0, 0xf800
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dword v5, v[4:5]
+; GFX940-NEXT: flat_load_dword v3, v[4:5]
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB15_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2372,12 +2373,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1
-; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0
+; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -2399,12 +2400,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v0, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v1, v1
-; GFX908-NEXT: v_max_f32_e32 v0, v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2476,13 +2477,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2523,13 +2524,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2549,13 +2550,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2625,21 +2626,21 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB17_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2673,22 +2674,22 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2698,20 +2699,20 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2769,29 +2770,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execz .LBB18_4
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -2800,25 +2801,24 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: ; %bb.3: ; %Flow
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX12-NEXT: .LBB18_4: ; %Flow2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execz .LBB18_6
; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -2865,29 +2865,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execz .LBB18_4
; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -2895,23 +2895,22 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-NEXT: ; %bb.3: ; %Flow
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: .LBB18_4: ; %Flow2
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB18_6
; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -3001,91 +3000,90 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB18_4
; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_mov_b32_e32 v9, v3
+; GFX908-NEXT: v_mov_b32_e32 v8, v2
+; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB18_2
; GFX908-NEXT: ; %bb.3: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX908-NEXT: .LBB18_4: ; %Flow2
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB18_6
; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
-; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1]
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v1, v3
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB18_4
; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v5, v[4:5]
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[2:3]
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB18_2
; GFX8-NEXT: ; %bb.3: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX8-NEXT: .LBB18_4: ; %Flow2
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB18_6
@@ -3093,18 +3091,17 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6
-; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen
-; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1]
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen
; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -3162,6 +3159,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
@@ -3188,10 +3186,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9]
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9]
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3210,7 +3207,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_cbranch_execz .LBB19_2
; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3266,6 +3262,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
@@ -3288,10 +3285,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9]
-; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3310,7 +3306,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: s_cbranch_execz .LBB19_2
; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3408,6 +3403,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0
; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
@@ -3430,9 +3426,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v9, v1
; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7]
+; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -3448,7 +3443,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX908-NEXT: s_cbranch_execz .LBB19_2
; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
@@ -3464,6 +3458,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0
@@ -3491,9 +3486,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7]
+; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3509,7 +3503,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX8-NEXT: s_cbranch_execz .LBB19_2
; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6
; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
@@ -3578,6 +3571,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
@@ -3604,10 +3598,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9]
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9]
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3626,7 +3619,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_cbranch_execz .LBB20_2
; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3683,6 +3675,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
@@ -3705,10 +3698,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9]
-; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3727,7 +3719,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: s_cbranch_execz .LBB20_2
; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3825,6 +3816,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
@@ -3847,9 +3839,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v9, v1
; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7]
+; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -3865,7 +3856,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX908-NEXT: s_cbranch_execz .LBB20_2
; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
@@ -3881,6 +3871,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
@@ -3908,9 +3899,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7]
+; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3926,7 +3916,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX8-NEXT: s_cbranch_execz .LBB20_2
; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private
; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6
; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
@@ -3995,6 +3984,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4011,21 +4001,20 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global
-; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4034,19 +4023,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: ; %bb.5: ; %Flow
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execz .LBB21_2
; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
-; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
-; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
+; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7]
+; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4092,6 +4080,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
@@ -4104,22 +4093,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global
-; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -4127,19 +4115,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-NEXT: ; %bb.5: ; %Flow
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB21_2
; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
-; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
+; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -4225,6 +4212,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -4237,40 +4225,38 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v5, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB21_4
; GFX908-NEXT: ; %bb.5: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB21_2
; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
@@ -4278,6 +4264,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -4292,44 +4279,42 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[4:5]
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v5, v[2:3]
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB21_4
; GFX8-NEXT: ; %bb.5: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB21_2
; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private
; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4
-; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4385,13 +4370,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_3
; GFX12-NEXT: ; %bb.1: ; %Flow2
@@ -4404,21 +4390,20 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global
-; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4426,20 +4411,19 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_cbranch_execnz .LBB22_4
; GFX12-NEXT: ; %bb.5: ; %Flow
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execz .LBB22_2
; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private
-; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
-; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
-; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4487,12 +4471,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB22_3
; GFX11-NEXT: ; %bb.1: ; %Flow2
@@ -4502,42 +4487,40 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global
-; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7]
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_cbranch_execnz .LBB22_4
; GFX11-NEXT: ; %bb.5: ; %Flow
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB22_2
; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private
-; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
-; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -4627,10 +4610,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_3
@@ -4641,40 +4625,38 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB22_4
; GFX908-NEXT: ; %bb.5: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB22_2
; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private
-; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
@@ -4682,12 +4664,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_3
@@ -4698,44 +4681,42 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[4:5]
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB22_4
; GFX8-NEXT: ; %bb.5: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB22_2
; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private
-; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4
-; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4794,13 +4775,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_3
; GFX12-NEXT: ; %bb.1: ; %Flow2
@@ -4813,21 +4795,20 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global
-; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4835,20 +4816,19 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_cbranch_execnz .LBB23_4
; GFX12-NEXT: ; %bb.5: ; %Flow
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execz .LBB23_2
; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private
-; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
-; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
-; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4897,12 +4877,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB23_3
; GFX11-NEXT: ; %bb.1: ; %Flow2
@@ -4912,42 +4893,40 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global
-; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7]
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_cbranch_execnz .LBB23_4
; GFX11-NEXT: ; %bb.5: ; %Flow
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB23_2
; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private
-; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
-; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -5037,10 +5016,11 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB23_3
@@ -5051,40 +5031,38 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB23_4
; GFX908-NEXT: ; %bb.5: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB23_2
; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private
-; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
@@ -5092,12 +5070,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_3
@@ -5108,44 +5087,42 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[4:5]
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB23_4
; GFX8-NEXT: ; %bb.5: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB23_2
; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private
-; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4
-; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -5204,29 +5181,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execz .LBB24_4
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -5235,25 +5212,24 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: ; %bb.3: ; %Flow
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX12-NEXT: .LBB24_4: ; %Flow2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execz .LBB24_6
; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -5300,29 +5276,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execz .LBB24_4
; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -5330,78 +5306,76 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX11-NEXT: ; %bb.3: ; %Flow
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: .LBB24_4: ; %Flow2
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB24_6
; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4
; GFX10-NEXT: s_cbranch_execz .LBB24_4
; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX10-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v9, v3
+; GFX10-NEXT: v_mov_b32_e32 v8, v2
+; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB24_2
; GFX10-NEXT: ; %bb.3: ; %Flow
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX10-NEXT: .LBB24_4: ; %Flow2
; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
; GFX10-NEXT: s_cbranch_execz .LBB24_6
; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: v_mov_b32_e32 v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
@@ -5409,140 +5383,138 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB24_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB24_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB24_4: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB24_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1]
+; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB24_4
; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_mov_b32_e32 v9, v3
+; GFX908-NEXT: v_mov_b32_e32 v8, v2
+; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB24_2
; GFX908-NEXT: ; %bb.3: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX908-NEXT: .LBB24_4: ; %Flow2
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB24_6
; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
-; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1]
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v1, v3
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB24_4
; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v5, v[4:5]
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[2:3]
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB24_2
; GFX8-NEXT: ; %bb.3: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX8-NEXT: .LBB24_4: ; %Flow2
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB24_6
@@ -5550,18 +5522,17 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6
-; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen
-; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1]
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen
; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -5570,37 +5541,37 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX7-NEXT: s_cbranch_execz .LBB24_4
; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v5, v[4:5]
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v3, v[2:3]
+; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB24_2
; GFX7-NEXT: ; %bb.3: ; %Flow
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX7-NEXT: .LBB24_4: ; %Flow2
; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_cbranch_execz .LBB24_6
@@ -5608,18 +5579,17 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6
-; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen
-; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1]
+; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen
; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -5634,29 +5604,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execz .LBB25_4
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -5665,25 +5635,24 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX12-NEXT: ; %bb.3: ; %Flow
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX12-NEXT: .LBB25_4: ; %Flow2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execz .LBB25_6
; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -5730,29 +5699,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execz .LBB25_4
; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -5760,23 +5729,22 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX11-NEXT: ; %bb.3: ; %Flow
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: .LBB25_4: ; %Flow2
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB25_6
; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -5866,91 +5834,90 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB25_4
; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_mov_b32_e32 v9, v3
+; GFX908-NEXT: v_mov_b32_e32 v8, v2
+; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB25_2
; GFX908-NEXT: ; %bb.3: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX908-NEXT: .LBB25_4: ; %Flow2
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB25_6
; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
-; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1]
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v1, v3
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB25_4
; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v5, v[4:5]
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[2:3]
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB25_2
; GFX8-NEXT: ; %bb.3: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX8-NEXT: .LBB25_4: ; %Flow2
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB25_6
@@ -5958,18 +5925,17 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6
-; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen
-; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1]
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen
; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -6032,8 +5998,9 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
@@ -6045,12 +6012,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -6083,14 +6049,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
@@ -6110,8 +6076,9 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
@@ -6123,12 +6090,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -6152,6 +6118,7 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6163,10 +6130,9 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_max_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -6195,14 +6161,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
@@ -6229,14 +6195,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_max_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
@@ -6263,17 +6229,17 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -6336,10 +6302,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -6349,12 +6316,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -6389,14 +6355,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
@@ -6417,10 +6383,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -6430,12 +6397,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -6460,9 +6426,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v5, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -6471,10 +6438,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_max_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -6504,14 +6470,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
@@ -6539,14 +6505,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_max_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
@@ -6574,17 +6540,17 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -6649,10 +6615,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -6662,12 +6629,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -6703,14 +6669,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
@@ -6731,10 +6697,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -6744,12 +6711,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -6774,9 +6740,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v5, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -6785,10 +6752,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_max_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -6818,14 +6784,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
@@ -6853,14 +6819,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_max_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
@@ -6888,17 +6854,17 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -6962,8 +6928,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v4, v[0:1]
@@ -6975,10 +6942,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
@@ -7012,13 +6978,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v6, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v7
+; GFX940-NEXT: v_max_f16_e32 v4, v4, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX940-NEXT: buffer_wbl2 sc1
@@ -7038,8 +7004,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
@@ -7051,10 +7018,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_max_f16_e32 v3, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
@@ -7079,6 +7045,7 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
@@ -7090,9 +7057,8 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX10-NEXT: v_max_f16_e32 v3, v3, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -7121,13 +7087,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
@@ -7154,13 +7120,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
; GFX908-NEXT: v_not_b32_e32 v6, v3
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX908-NEXT: v_max_f16_e32 v3, v3, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
@@ -7187,16 +7153,16 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
; GFX8-NEXT: v_not_b32_e32 v6, v3
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -7255,36 +7221,36 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7302,29 +7268,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB30_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7334,37 +7300,37 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7376,31 +7342,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB30_1
@@ -7411,31 +7377,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7445,31 +7411,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB30_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7479,32 +7445,32 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB30_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7559,36 +7525,36 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7607,29 +7573,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB31_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7639,37 +7605,37 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7681,31 +7647,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB31_1
@@ -7716,31 +7682,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB31_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7750,31 +7716,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB31_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7784,32 +7750,32 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB31_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7865,15 +7831,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4
-; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4
+; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
@@ -7898,14 +7864,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f16_e32 v4, v2, v2
; GFX940-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f16_e32 v3, v3, v4
+; GFX940-NEXT: v_max_f16_e32 v3, v3, v2
; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
@@ -7924,15 +7890,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX11-NEXT: v_max_f16_e32 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX11-NEXT: v_max_f16_e32 v3, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
@@ -7956,23 +7922,23 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v1, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v5, v1, v1
-; GFX10-NEXT: v_max_f16_e32 v0, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_max_f16_e32 v0, v6, v6
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB32_1
@@ -7985,14 +7951,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v3, v3, v4
+; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2
; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8011,14 +7977,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
; GFX908-NEXT: v_max_f16_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f16_e32 v3, v3, v2
; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8039,19 +8005,19 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v1, v2, v2
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v5
-; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_e32 v0, v6, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB32_1
@@ -8101,24 +8067,24 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4
+; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -8132,23 +8098,23 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v4, v2, v2
; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f16_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f16_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f16_e32 v3, v4, v3
-; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB33_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8158,25 +8124,25 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-NEXT: v_max_f16_e32 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f16_e32 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v5, v4, v4
+; GFX11-NEXT: v_max_f16_e32 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v5, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -8190,23 +8156,23 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_max_f16_e32 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX10-NEXT: v_max_f16_e32 v3, v5, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB33_1
@@ -8217,22 +8183,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v3, v4, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
+; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB33_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8242,22 +8208,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v4, v2, v2
; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f16_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f16_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
+; GFX908-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB33_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8269,22 +8235,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v4, v2, v2
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_max_f16_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB33_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8334,10 +8300,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -8347,12 +8314,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -8388,14 +8354,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
@@ -8416,10 +8382,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -8429,12 +8396,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -8459,9 +8425,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v5, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -8470,10 +8437,9 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_max_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -8503,14 +8469,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: buffer_wbl2
@@ -8540,14 +8506,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_max_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
@@ -8575,17 +8541,17 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -8648,37 +8614,37 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -8696,29 +8662,29 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB35_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8728,37 +8694,37 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -8770,31 +8736,31 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX10-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB35_1
@@ -8805,33 +8771,33 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB35_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8841,31 +8807,31 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX908-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB35_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8875,32 +8841,32 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX8-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB35_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12308,15 +12274,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12338,14 +12304,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_max_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12363,15 +12329,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12391,14 +12357,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12418,13 +12384,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -12442,13 +12408,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_max_f16 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -12466,21 +12432,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v3, v7, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v5
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB46_1
@@ -12544,15 +12510,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12574,14 +12540,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_max_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12599,15 +12565,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12628,21 +12594,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_pk_max_f16 v1, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_pk_max_f16 v0, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v0, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_pk_max_f16 v0, v6, v6
+; GFX10-NEXT: v_pk_max_f16 v5, v0, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB47_1
@@ -12655,13 +12621,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -12679,13 +12645,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_max_f16 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -12705,21 +12671,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v0, v7, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB47_1
@@ -12783,15 +12749,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12821,19 +12787,19 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v1, v2, v2
; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX940-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_pk_max_f16 v0, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v0, v0, v3
+; GFX940-NEXT: v_pk_max_f16 v2, v0, v1
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB48_1
@@ -12852,20 +12818,20 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_pk_max_f16 v1, v2, v2
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: v_pk_max_f16 v0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v1, v1
-; GFX11-NEXT: v_pk_max_f16 v0, v5, v0
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v0, v6, v6
+; GFX11-NEXT: v_pk_max_f16 v5, v0, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12879,21 +12845,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_pk_max_f16 v1, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_pk_max_f16 v0, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v0, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_pk_max_f16 v0, v6, v6
+; GFX10-NEXT: v_pk_max_f16 v5, v0, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB48_1
@@ -12910,17 +12876,17 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2
; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
-; GFX90A-NEXT: v_pk_max_f16 v0, v0, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v0, v1
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
@@ -12937,17 +12903,17 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v1, v2, v2
; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
-; GFX908-NEXT: v_pk_max_f16 v0, v1, v1
-; GFX908-NEXT: v_pk_max_f16 v0, v0, v5
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_pk_max_f16 v0, v6, v6
+; GFX908-NEXT: v_pk_max_f16 v5, v0, v1
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB48_1
@@ -12962,21 +12928,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v0, v7, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB48_1
@@ -13039,21 +13005,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -13067,22 +13033,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB49_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13092,22 +13058,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13119,21 +13085,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB49_1
@@ -13144,20 +13110,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13167,20 +13133,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB49_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13190,24 +13156,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB49_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13266,21 +13232,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -13294,22 +13260,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB50_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13319,22 +13285,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13348,21 +13314,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB50_1
@@ -13373,20 +13339,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13396,20 +13362,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB50_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13421,24 +13387,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB50_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13500,21 +13466,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -13532,24 +13498,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX940-NEXT: s_movk_i32 s0, 0xf800
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dword v5, v[4:5]
+; GFX940-NEXT: flat_load_dword v3, v[4:5]
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB51_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13563,22 +13529,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v4, v[3:4]
+; GFX11-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13592,21 +13558,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB51_1
@@ -13623,12 +13589,12 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v0, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX90A-NEXT: v_pk_max_f16 v0, v3, v0
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -13650,12 +13616,12 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v0, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v1, v1
-; GFX908-NEXT: v_pk_max_f16 v0, v5, v0
+; GFX908-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX908-NEXT: v_pk_max_f16 v0, v0, v2
; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -13673,24 +13639,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB51_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13753,15 +13719,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@@ -13784,14 +13750,14 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_max_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13809,15 +13775,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13838,21 +13804,21 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_pk_max_f16 v1, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_pk_max_f16 v0, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v0, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_pk_max_f16 v0, v6, v6
+; GFX10-NEXT: v_pk_max_f16 v5, v0, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB52_1
@@ -13865,13 +13831,13 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13891,13 +13857,13 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_max_f16 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -13917,21 +13883,21 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v0, v7, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB52_1
@@ -13994,22 +13960,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14023,22 +13989,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX940-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB53_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14048,22 +14014,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX11-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -14077,21 +14043,21 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB53_1
@@ -14102,22 +14068,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX90A-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14127,20 +14093,20 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX908-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB53_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14152,24 +14118,24 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB53_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14236,36 +14202,38 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14282,35 +14250,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB54_1
@@ -14323,39 +14291,41 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14370,34 +14340,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB54_1
@@ -14411,33 +14381,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
@@ -14451,33 +14421,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB54_1
@@ -14491,34 +14461,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB54_1
@@ -14578,36 +14548,38 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14624,35 +14596,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB55_1
@@ -14665,39 +14637,41 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14713,35 +14687,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v0
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB55_1
@@ -14754,33 +14728,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
@@ -14794,33 +14768,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB55_1
@@ -14836,34 +14810,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_max_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB55_1
@@ -14923,36 +14897,38 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14977,35 +14953,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX940-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX940-NEXT: v_max_f32_e32 v0, v7, v0
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v0, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX940-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX940-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v0, v3, v0, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB56_1
@@ -15024,38 +15000,40 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: v_dual_max_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v5, v7, v5
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15070,35 +15048,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v0
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB56_1
@@ -15115,33 +15093,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX90A-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX90A-NEXT: v_max_f32_e32 v0, v7, v0
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
@@ -15158,33 +15136,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX908-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_max_f32_e32 v0, v7, v0
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB56_1
@@ -15199,34 +15177,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_max_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB56_1
@@ -15285,38 +15263,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15330,38 +15308,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB57_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15371,41 +15349,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15418,35 +15396,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB57_1
@@ -15457,36 +15435,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB57_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15496,36 +15474,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB57_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15535,37 +15513,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB57_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15620,38 +15598,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15665,38 +15643,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB58_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15706,41 +15684,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15755,35 +15733,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB58_1
@@ -15794,36 +15772,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15833,36 +15811,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB58_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15874,37 +15852,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB58_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15962,38 +15940,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16011,40 +15989,40 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX940-NEXT: s_movk_i32 s0, 0xf800
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dword v5, v[4:5]
+; GFX940-NEXT: flat_load_dword v3, v[4:5]
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB59_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16058,41 +16036,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v4, v[3:4]
+; GFX11-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16107,35 +16085,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB59_1
@@ -16152,28 +16130,28 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0
-; GFX90A-NEXT: v_max_f32_e32 v3, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -16195,28 +16173,28 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX908-NEXT: v_max_f32_e32 v0, v5, v0
-; GFX908-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v5
+; GFX908-NEXT: v_max_f32_e32 v6, v6, v2
+; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -16234,37 +16212,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB59_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16323,37 +16301,39 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16370,35 +16350,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB60_1
@@ -16411,39 +16391,41 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16459,35 +16441,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v0
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB60_1
@@ -16500,35 +16482,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB60_1
@@ -16542,33 +16524,33 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB60_1
@@ -16584,34 +16566,34 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_max_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB60_1
@@ -16670,39 +16652,39 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16716,38 +16698,38 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX940-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB61_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16757,41 +16739,41 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX11-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16806,35 +16788,35 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB61_1
@@ -16845,38 +16827,38 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX90A-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB61_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16886,36 +16868,36 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX908-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB61_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16927,37 +16909,37 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB61_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index daa3df680e5ca0..d96d3db9f005df 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -31,13 +31,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -76,13 +76,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -100,13 +100,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -173,13 +173,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -220,13 +220,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -244,13 +244,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -329,18 +329,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX940-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_max_f32_e32 v0, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v0, v1
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB2_1
@@ -381,17 +381,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
@@ -408,17 +408,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX908-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v6, v6
+; GFX908-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_1
@@ -482,21 +482,21 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB3_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -528,20 +528,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -551,20 +551,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB3_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -622,21 +622,21 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB4_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -670,20 +670,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB4_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -693,20 +693,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB4_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -773,23 +773,23 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX940-NEXT: s_movk_i32 s0, 0xf800
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dword v5, v[4:5]
+; GFX940-NEXT: flat_load_dword v3, v[4:5]
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -831,12 +831,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1
-; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0
+; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -858,12 +858,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v0, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v1, v1
-; GFX908-NEXT: v_min_f32_e32 v0, v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX908-NEXT: v_min_f32_e32 v0, v0, v2
; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -935,13 +935,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -982,13 +982,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1008,13 +1008,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1084,21 +1084,21 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB7_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1132,22 +1132,22 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB7_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1157,20 +1157,20 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1235,13 +1235,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1259,14 +1259,15 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1286,14 +1287,14 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX10-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1313,13 +1314,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1337,13 +1338,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1426,13 +1427,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1471,13 +1472,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1495,13 +1496,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1572,13 +1573,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1617,13 +1618,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1641,13 +1642,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1714,13 +1715,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1761,13 +1762,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1785,13 +1786,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1870,18 +1871,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX940-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_max_f32_e32 v0, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v0, v1
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB12_1
@@ -1922,17 +1923,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
@@ -1949,17 +1950,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX908-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v6, v6
+; GFX908-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB12_1
@@ -2023,21 +2024,21 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB13_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2069,20 +2070,20 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2092,20 +2093,20 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2163,21 +2164,21 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB14_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2211,20 +2212,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2234,20 +2235,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2314,23 +2315,23 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX940-NEXT: s_movk_i32 s0, 0xf800
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dword v5, v[4:5]
+; GFX940-NEXT: flat_load_dword v3, v[4:5]
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB15_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2372,12 +2373,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1
-; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0
+; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -2399,12 +2400,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v0, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v1, v1
-; GFX908-NEXT: v_min_f32_e32 v0, v5, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX908-NEXT: v_min_f32_e32 v0, v0, v2
; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2476,13 +2477,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2523,13 +2524,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2549,13 +2550,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2625,21 +2626,21 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB17_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2673,22 +2674,22 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2698,20 +2699,20 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2769,29 +2770,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execz .LBB18_4
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
+; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -2800,25 +2801,24 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: ; %bb.3: ; %Flow
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX12-NEXT: .LBB18_4: ; %Flow2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execz .LBB18_6
; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -2865,29 +2865,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execz .LBB18_4
; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -2895,23 +2895,22 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-NEXT: ; %bb.3: ; %Flow
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: .LBB18_4: ; %Flow2
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB18_6
; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -3001,91 +3000,90 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB18_4
; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_mov_b32_e32 v9, v3
+; GFX908-NEXT: v_mov_b32_e32 v8, v2
+; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB18_2
; GFX908-NEXT: ; %bb.3: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX908-NEXT: .LBB18_4: ; %Flow2
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB18_6
; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
-; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v1, v3
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB18_4
; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v5, v[4:5]
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[2:3]
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB18_2
; GFX8-NEXT: ; %bb.3: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX8-NEXT: .LBB18_4: ; %Flow2
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB18_6
@@ -3093,18 +3091,17 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6
-; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen
-; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen
; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -3162,6 +3159,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
@@ -3188,10 +3186,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9]
-; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[6:7], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9]
+; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3210,7 +3207,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_cbranch_execz .LBB19_2
; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3266,6 +3262,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
@@ -3288,10 +3285,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9]
-; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3310,7 +3306,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: s_cbranch_execz .LBB19_2
; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3408,6 +3403,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0
; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
@@ -3430,9 +3426,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v9, v1
; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7]
+; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -3448,7 +3443,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX908-NEXT: s_cbranch_execz .LBB19_2
; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
@@ -3464,6 +3458,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0
@@ -3491,9 +3486,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7]
+; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3509,7 +3503,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX8-NEXT: s_cbranch_execz .LBB19_2
; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private
; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6
; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
@@ -3578,6 +3571,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
@@ -3604,10 +3598,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9]
-; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[6:7], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9]
+; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3626,7 +3619,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_cbranch_execz .LBB20_2
; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3683,6 +3675,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
@@ -3705,10 +3698,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9]
-; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3727,7 +3719,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: s_cbranch_execz .LBB20_2
; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3825,6 +3816,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
@@ -3847,9 +3839,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v9, v1
; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7]
+; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -3865,7 +3856,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX908-NEXT: s_cbranch_execz .LBB20_2
; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
@@ -3881,6 +3871,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
@@ -3908,9 +3899,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7]
+; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3926,7 +3916,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX8-NEXT: s_cbranch_execz .LBB20_2
; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private
; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6
; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
@@ -3995,6 +3984,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4011,21 +4001,20 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global
-; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4034,19 +4023,18 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: ; %bb.5: ; %Flow
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execz .LBB21_2
; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
-; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
-; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
+; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7]
+; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4092,6 +4080,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
@@ -4104,22 +4093,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global
-; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -4127,19 +4115,18 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-NEXT: ; %bb.5: ; %Flow
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB21_2
; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
-; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
+; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off
+; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -4225,6 +4212,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -4237,40 +4225,38 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v5, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB21_4
; GFX908-NEXT: ; %bb.5: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB21_2
; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
@@ -4278,6 +4264,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -4292,44 +4279,42 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[4:5]
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v5, v[2:3]
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB21_4
; GFX8-NEXT: ; %bb.5: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB21_2
; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private
; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4
-; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7]
+; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4385,13 +4370,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_3
; GFX12-NEXT: ; %bb.1: ; %Flow2
@@ -4404,21 +4390,20 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global
-; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4426,20 +4411,19 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_cbranch_execnz .LBB22_4
; GFX12-NEXT: ; %bb.5: ; %Flow
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execz .LBB22_2
; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private
-; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
-; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
-; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4487,12 +4471,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB22_3
; GFX11-NEXT: ; %bb.1: ; %Flow2
@@ -4502,42 +4487,40 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global
-; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7]
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_cbranch_execnz .LBB22_4
; GFX11-NEXT: ; %bb.5: ; %Flow
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB22_2
; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private
-; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
-; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off
+; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -4627,10 +4610,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_3
@@ -4641,40 +4625,38 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB22_4
; GFX908-NEXT: ; %bb.5: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB22_2
; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private
-; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
@@ -4682,12 +4664,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_3
@@ -4698,44 +4681,42 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[4:5]
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB22_4
; GFX8-NEXT: ; %bb.5: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB22_2
; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private
-; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4
-; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4794,13 +4775,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_3
; GFX12-NEXT: ; %bb.1: ; %Flow2
@@ -4813,21 +4795,20 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global
-; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4835,20 +4816,19 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_cbranch_execnz .LBB23_4
; GFX12-NEXT: ; %bb.5: ; %Flow
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execz .LBB23_2
; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private
-; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
-; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
-; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -4897,12 +4877,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB23_3
; GFX11-NEXT: ; %bb.1: ; %Flow2
@@ -4912,42 +4893,40 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global
-; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7]
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_cbranch_execnz .LBB23_4
; GFX11-NEXT: ; %bb.5: ; %Flow
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB23_2
; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private
-; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
-; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off
+; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -5037,10 +5016,11 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB23_3
@@ -5051,40 +5031,38 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB23_4
; GFX908-NEXT: ; %bb.5: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB23_2
; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private
-; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
@@ -5092,12 +5070,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_3
@@ -5108,44 +5087,42 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[4:5]
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB23_4
; GFX8-NEXT: ; %bb.5: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB23_2
; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private
-; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4
-; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -5204,29 +5181,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execz .LBB24_4
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
+; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -5235,25 +5212,24 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: ; %bb.3: ; %Flow
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX12-NEXT: .LBB24_4: ; %Flow2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execz .LBB24_6
; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -5300,29 +5276,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execz .LBB24_4
; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -5330,78 +5306,76 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX11-NEXT: ; %bb.3: ; %Flow
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: .LBB24_4: ; %Flow2
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB24_6
; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4
; GFX10-NEXT: s_cbranch_execz .LBB24_4
; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX10-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v9, v3
+; GFX10-NEXT: v_mov_b32_e32 v8, v2
+; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB24_2
; GFX10-NEXT: ; %bb.3: ; %Flow
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX10-NEXT: .LBB24_4: ; %Flow2
; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
; GFX10-NEXT: s_cbranch_execz .LBB24_6
; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: v_mov_b32_e32 v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
@@ -5409,140 +5383,138 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB24_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX90A-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB24_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB24_4: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB24_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
+; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB24_4
; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_mov_b32_e32 v9, v3
+; GFX908-NEXT: v_mov_b32_e32 v8, v2
+; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB24_2
; GFX908-NEXT: ; %bb.3: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX908-NEXT: .LBB24_4: ; %Flow2
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB24_6
; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
-; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v1, v3
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB24_4
; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v5, v[4:5]
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[2:3]
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB24_2
; GFX8-NEXT: ; %bb.3: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX8-NEXT: .LBB24_4: ; %Flow2
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB24_6
@@ -5550,18 +5522,17 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6
-; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen
-; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen
; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -5570,37 +5541,37 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX7-NEXT: s_cbranch_execz .LBB24_4
; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v5, v[4:5]
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v3, v[2:3]
+; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX7-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB24_2
; GFX7-NEXT: ; %bb.3: ; %Flow
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX7-NEXT: .LBB24_4: ; %Flow2
; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_cbranch_execz .LBB24_6
@@ -5608,18 +5579,17 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6
-; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen
-; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
+; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen
; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -5634,29 +5604,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execz .LBB25_4
; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
+; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -5665,25 +5635,24 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX12-NEXT: ; %bb.3: ; %Flow
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX12-NEXT: .LBB25_4: ; %Flow2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX12-NEXT: s_cbranch_execz .LBB25_6
; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -5730,29 +5699,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execz .LBB25_4
; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -5760,23 +5729,22 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX11-NEXT: ; %bb.3: ; %Flow
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: .LBB25_4: ; %Flow2
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB25_6
; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
-; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off
; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -5866,91 +5834,90 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB25_4
; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_mov_b32_e32 v9, v3
+; GFX908-NEXT: v_mov_b32_e32 v8, v2
+; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB25_2
; GFX908-NEXT: ; %bb.3: ; %Flow
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX908-NEXT: .LBB25_4: ; %Flow2
; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_cbranch_execz .LBB25_6
; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
-; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v0, v2
+; GFX908-NEXT: v_mov_b32_e32 v1, v3
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB25_4
; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v5, v[4:5]
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[2:3]
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB25_2
; GFX8-NEXT: ; %bb.3: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX8-NEXT: .LBB25_4: ; %Flow2
; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_cbranch_execz .LBB25_6
@@ -5958,18 +5925,17 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6
-; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen
-; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
+; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen
; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -6032,8 +5998,9 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
@@ -6045,12 +6012,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -6083,14 +6049,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_min_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
@@ -6110,8 +6076,9 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
@@ -6123,12 +6090,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -6152,6 +6118,7 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6163,10 +6130,9 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_min_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -6195,14 +6161,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
@@ -6229,14 +6195,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_min_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_min_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
@@ -6263,17 +6229,17 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -6336,10 +6302,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -6349,12 +6316,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -6389,14 +6355,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_min_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
@@ -6417,10 +6383,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -6430,12 +6397,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -6460,9 +6426,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v5, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -6471,10 +6438,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_min_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -6504,14 +6470,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
@@ -6539,14 +6505,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_min_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_min_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
@@ -6574,17 +6540,17 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -6649,10 +6615,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -6662,12 +6629,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -6703,14 +6669,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_min_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
@@ -6731,10 +6697,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -6744,12 +6711,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -6774,9 +6740,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v5, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -6785,10 +6752,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_min_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -6818,14 +6784,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
@@ -6853,14 +6819,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_min_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_min_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
@@ -6888,17 +6854,17 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -6962,8 +6928,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v4, v[0:1]
@@ -6975,10 +6942,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
@@ -7012,13 +6978,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v6, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_min_f16_e32 v4, v4, v7
+; GFX940-NEXT: v_min_f16_e32 v4, v4, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX940-NEXT: buffer_wbl2 sc1
@@ -7038,8 +7004,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
@@ -7051,10 +7018,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_min_f16_e32 v3, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
@@ -7079,6 +7045,7 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
@@ -7090,9 +7057,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX10-NEXT: v_min_f16_e32 v3, v3, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -7121,13 +7087,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7
+; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
@@ -7154,13 +7120,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
; GFX908-NEXT: v_not_b32_e32 v6, v3
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX908-NEXT: v_min_f16_e32 v3, v3, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
@@ -7187,16 +7153,16 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
; GFX8-NEXT: v_not_b32_e32 v6, v3
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
+; GFX8-NEXT: v_min_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -7255,36 +7221,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7302,29 +7268,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_min_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB30_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7334,37 +7300,37 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7376,31 +7342,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB30_1
@@ -7411,31 +7377,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7445,31 +7411,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB30_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7479,32 +7445,32 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB30_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7559,36 +7525,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7607,29 +7573,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_min_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB31_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7639,37 +7605,37 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7681,31 +7647,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB31_1
@@ -7716,31 +7682,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB31_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7750,31 +7716,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB31_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7784,32 +7750,32 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB31_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7865,15 +7831,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4
-; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4
+; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
@@ -7898,14 +7864,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f16_e32 v4, v2, v2
; GFX940-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f16_e32 v3, v3, v4
+; GFX940-NEXT: v_min_f16_e32 v3, v3, v2
; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
@@ -7924,15 +7890,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX11-NEXT: v_min_f16_e32 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX11-NEXT: v_min_f16_e32 v3, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
@@ -7956,23 +7922,23 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v1, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v5, v1, v1
-; GFX10-NEXT: v_min_f16_e32 v0, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_max_f16_e32 v0, v6, v6
+; GFX10-NEXT: v_min_f16_e32 v0, v0, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB32_1
@@ -7985,14 +7951,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v3, v3, v4
+; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2
; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8011,14 +7977,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
; GFX908-NEXT: v_max_f16_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f16_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f16_e32 v3, v3, v2
; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8039,19 +8005,19 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v1, v2, v2
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX8-NEXT: v_min_f16_e32 v0, v0, v5
-; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_e32 v0, v6, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB32_1
@@ -8101,24 +8067,24 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4
+; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -8132,23 +8098,23 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v4, v2, v2
; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f16_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f16_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f16_e32 v3, v4, v3
-; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB33_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8158,25 +8124,25 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-NEXT: v_max_f16_e32 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f16_e32 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v5, v4, v4
+; GFX11-NEXT: v_max_f16_e32 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v3, v5, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -8190,23 +8156,23 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_max_f16_e32 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX10-NEXT: v_min_f16_e32 v3, v5, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX10-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB33_1
@@ -8217,22 +8183,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v3, v4, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
+; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB33_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8242,22 +8208,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v4, v2, v2
; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f16_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f16_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
+; GFX908-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB33_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8269,22 +8235,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v4, v2, v2
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_min_f16_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB33_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8334,10 +8300,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -8347,12 +8314,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -8388,14 +8354,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_min_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
@@ -8416,10 +8382,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -8429,12 +8396,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -8459,9 +8425,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v5, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -8470,10 +8437,9 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_min_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -8503,14 +8469,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: buffer_wbl2
@@ -8540,14 +8506,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_min_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_min_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
@@ -8575,17 +8541,17 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -8648,37 +8614,37 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -8696,29 +8662,29 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_min_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB35_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8728,37 +8694,37 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -8770,31 +8736,31 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX10-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB35_1
@@ -8805,33 +8771,33 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB35_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8841,31 +8807,31 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX908-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB35_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8875,32 +8841,32 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX8-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB35_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12308,15 +12274,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12338,14 +12304,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_min_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12363,15 +12329,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_min_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12391,14 +12357,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX10-NEXT: v_pk_min_f16 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12418,13 +12384,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -12442,13 +12408,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_min_f16 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -12466,21 +12432,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v3, v7, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v5
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB46_1
@@ -12544,15 +12510,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12574,14 +12540,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_min_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12599,15 +12565,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_min_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12628,21 +12594,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_pk_max_f16 v1, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_pk_max_f16 v0, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v1, v1
-; GFX10-NEXT: v_pk_min_f16 v0, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_pk_max_f16 v0, v6, v6
+; GFX10-NEXT: v_pk_min_f16 v5, v0, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB47_1
@@ -12655,13 +12621,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -12679,13 +12645,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_min_f16 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -12705,21 +12671,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v1, v1
-; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v0, v7, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB47_1
@@ -12783,15 +12749,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12821,19 +12787,19 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v1, v2, v2
; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX940-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_pk_max_f16 v0, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v0, v0, v3
+; GFX940-NEXT: v_pk_min_f16 v2, v0, v1
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB48_1
@@ -12852,20 +12818,20 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_pk_max_f16 v1, v2, v2
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: v_pk_max_f16 v0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v1, v1
-; GFX11-NEXT: v_pk_min_f16 v0, v5, v0
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v0, v6, v6
+; GFX11-NEXT: v_pk_min_f16 v5, v0, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12879,21 +12845,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_pk_max_f16 v1, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_pk_max_f16 v0, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v1, v1
-; GFX10-NEXT: v_pk_min_f16 v0, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_pk_max_f16 v0, v6, v6
+; GFX10-NEXT: v_pk_min_f16 v5, v0, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB48_1
@@ -12910,17 +12876,17 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2
; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
-; GFX90A-NEXT: v_pk_min_f16 v0, v0, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v0, v1
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
@@ -12937,17 +12903,17 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v1, v2, v2
; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
-; GFX908-NEXT: v_pk_max_f16 v0, v1, v1
-; GFX908-NEXT: v_pk_min_f16 v0, v0, v5
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_pk_max_f16 v0, v6, v6
+; GFX908-NEXT: v_pk_min_f16 v5, v0, v1
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB48_1
@@ -12962,21 +12928,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v1, v1
-; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v0, v7, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB48_1
@@ -13039,21 +13005,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -13067,22 +13033,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_min_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB49_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13092,22 +13058,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_min_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13119,21 +13085,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_min_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB49_1
@@ -13144,20 +13110,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13167,20 +13133,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB49_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13190,24 +13156,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB49_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13266,21 +13232,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -13294,22 +13260,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_min_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB50_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13319,22 +13285,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_min_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13348,21 +13314,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_min_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB50_1
@@ -13373,20 +13339,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13396,20 +13362,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB50_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13421,24 +13387,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB50_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13500,21 +13466,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -13532,24 +13498,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX940-NEXT: s_movk_i32 s0, 0xf800
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dword v5, v[4:5]
+; GFX940-NEXT: flat_load_dword v3, v[4:5]
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_min_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB51_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13563,22 +13529,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v4, v[3:4]
+; GFX11-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_min_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13592,21 +13558,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_min_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB51_1
@@ -13623,12 +13589,12 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v0, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX90A-NEXT: v_pk_min_f16 v0, v3, v0
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -13650,12 +13616,12 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v0, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v1, v1
-; GFX908-NEXT: v_pk_min_f16 v0, v5, v0
+; GFX908-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX908-NEXT: v_pk_min_f16 v0, v0, v2
; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -13673,24 +13639,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB51_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13753,15 +13719,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@@ -13784,14 +13750,14 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_min_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13809,15 +13775,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_min_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13838,21 +13804,21 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_pk_max_f16 v1, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_pk_max_f16 v0, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v1, v1
-; GFX10-NEXT: v_pk_min_f16 v0, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_pk_max_f16 v0, v6, v6
+; GFX10-NEXT: v_pk_min_f16 v5, v0, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB52_1
@@ -13865,13 +13831,13 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13891,13 +13857,13 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_min_f16 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -13917,21 +13883,21 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v1, v1
-; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v0, v7, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB52_1
@@ -13994,22 +13960,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14023,22 +13989,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX940-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_min_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB53_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14048,22 +14014,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX11-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_min_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -14077,21 +14043,21 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_min_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB53_1
@@ -14102,22 +14068,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14127,20 +14093,20 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX908-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v5, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB53_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14152,24 +14118,24 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB53_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14236,36 +14202,38 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14282,35 +14250,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB54_1
@@ -14323,39 +14291,41 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14370,34 +14340,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB54_1
@@ -14411,33 +14381,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
@@ -14451,33 +14421,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB54_1
@@ -14491,34 +14461,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB54_1
@@ -14578,36 +14548,38 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14624,35 +14596,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB55_1
@@ -14665,39 +14637,41 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14713,35 +14687,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT: v_min_f32_e32 v0, v6, v0
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB55_1
@@ -14754,33 +14728,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
@@ -14794,33 +14768,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB55_1
@@ -14836,34 +14810,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_min_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB55_1
@@ -14923,36 +14897,38 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14977,35 +14953,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX940-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX940-NEXT: v_min_f32_e32 v0, v7, v0
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v0, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX940-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX940-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v0, v3, v0, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB56_1
@@ -15024,38 +15000,40 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: v_dual_min_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v5, v7, v5
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15070,35 +15048,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT: v_min_f32_e32 v0, v6, v0
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB56_1
@@ -15115,33 +15093,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX90A-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX90A-NEXT: v_min_f32_e32 v0, v7, v0
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
@@ -15158,33 +15136,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX908-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_min_f32_e32 v0, v7, v0
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB56_1
@@ -15199,34 +15177,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_min_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB56_1
@@ -15285,38 +15263,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15330,38 +15308,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB57_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15371,41 +15349,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15418,35 +15396,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB57_1
@@ -15457,36 +15435,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB57_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15496,36 +15474,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB57_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15535,37 +15513,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB57_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15620,38 +15598,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15665,38 +15643,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB58_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15706,41 +15684,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15755,35 +15733,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB58_1
@@ -15794,36 +15772,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15833,36 +15811,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB58_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15874,37 +15852,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB58_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15962,38 +15940,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16011,40 +15989,40 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX940-NEXT: s_movk_i32 s0, 0xf800
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dword v5, v[4:5]
+; GFX940-NEXT: flat_load_dword v3, v[4:5]
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB59_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16058,41 +16036,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v4, v[3:4]
+; GFX11-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16107,35 +16085,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB59_1
@@ -16152,28 +16130,28 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0
-; GFX90A-NEXT: v_min_f32_e32 v3, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -16195,28 +16173,28 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX908-NEXT: v_min_f32_e32 v0, v5, v0
-; GFX908-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX908-NEXT: v_min_f32_e32 v0, v0, v5
+; GFX908-NEXT: v_min_f32_e32 v6, v6, v2
+; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -16234,37 +16212,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB59_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16323,37 +16301,39 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16370,35 +16350,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB60_1
@@ -16411,39 +16391,41 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16459,35 +16441,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT: v_min_f32_e32 v0, v6, v0
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB60_1
@@ -16500,35 +16482,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB60_1
@@ -16542,33 +16524,33 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB60_1
@@ -16584,34 +16566,34 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_min_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB60_1
@@ -16670,39 +16652,39 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16716,38 +16698,38 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX940-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB61_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16757,41 +16739,41 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX11-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16806,35 +16788,35 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB61_1
@@ -16845,38 +16827,38 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB61_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16886,36 +16868,36 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX908-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB61_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16927,37 +16909,37 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB61_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index 1b56fd020d5020..14f75814128f18 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -13729,36 +13729,38 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -13775,35 +13777,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB50_1
@@ -13816,39 +13818,41 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -13863,34 +13867,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB50_1
@@ -13904,33 +13908,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
@@ -13944,33 +13948,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB50_1
@@ -13984,34 +13988,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB50_1
@@ -14071,36 +14075,38 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14117,35 +14123,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB51_1
@@ -14158,39 +14164,41 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14206,35 +14214,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB51_1
@@ -14247,33 +14255,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB51_1
@@ -14287,33 +14295,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB51_1
@@ -14329,34 +14337,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB51_1
@@ -14416,36 +14424,38 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14470,35 +14480,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX940-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX940-NEXT: v_sub_f32_e32 v0, v7, v0
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v0, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX940-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX940-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v0, v3, v0, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB52_1
@@ -14517,38 +14527,40 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: v_dual_sub_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v5
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14563,35 +14575,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB52_1
@@ -14608,33 +14620,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX90A-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX90A-NEXT: v_sub_f32_e32 v0, v7, v0
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
@@ -14651,33 +14663,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_sub_f32_e32 v0, v7, v0
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB52_1
@@ -14692,34 +14704,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB52_1
@@ -14778,38 +14790,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14823,38 +14835,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB53_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14864,41 +14876,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14911,35 +14923,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB53_1
@@ -14950,36 +14962,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14989,36 +15001,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB53_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15028,37 +15040,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB53_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15113,38 +15125,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15158,38 +15170,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB54_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15199,41 +15211,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15248,35 +15260,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB54_1
@@ -15287,36 +15299,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15326,36 +15338,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB54_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15367,37 +15379,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB54_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15455,38 +15467,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15504,40 +15516,40 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX940-NEXT: s_movk_i32 s0, 0xf800
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dword v5, v[4:5]
+; GFX940-NEXT: flat_load_dword v3, v[4:5]
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB55_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15551,41 +15563,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v4, v[3:4]
+; GFX11-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15600,35 +15612,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB55_1
@@ -15645,28 +15657,28 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX90A-NEXT: v_sub_f32_e32 v0, v3, v0
-; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -15688,28 +15700,28 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX908-NEXT: v_sub_f32_e32 v0, v5, v0
-; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX908-NEXT: v_sub_f32_e32 v0, v0, v5
+; GFX908-NEXT: v_sub_f32_e32 v6, v6, v2
+; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -15727,37 +15739,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB55_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15816,37 +15828,39 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15863,35 +15877,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB56_1
@@ -15904,39 +15918,41 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15952,35 +15968,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB56_1
@@ -15993,35 +16009,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
@@ -16035,33 +16051,33 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB56_1
@@ -16077,34 +16093,34 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB56_1
@@ -16163,39 +16179,39 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16209,38 +16225,38 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB57_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16250,41 +16266,41 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX11-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16299,35 +16315,35 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB57_1
@@ -16338,38 +16354,38 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX90A-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB57_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16379,36 +16395,36 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX908-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB57_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16420,37 +16436,37 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB57_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 61549fa88f4244..1311560715ddd7 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -2057,21 +2057,19 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: v_not_b32_e32 v0, v0
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN1-NEXT: v_not_b32_e32 v2, v2
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB54_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2083,21 +2081,19 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: v_not_b32_e32 v0, v0
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN2-NEXT: v_not_b32_e32 v2, v2
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB54_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2109,21 +2105,19 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dword v1, v[0:1]
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: v_not_b32_e32 v0, v0
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN3-NEXT: v_not_b32_e32 v2, v2
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB54_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2141,25 +2135,23 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: v_not_b32_e32 v0, v0
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN1-NEXT: v_not_b32_e32 v2, v2
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB55_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_nand_i32_noret_offset_scalar:
@@ -2169,25 +2161,23 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: v_not_b32_e32 v0, v0
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN2-NEXT: v_not_b32_e32 v2, v2
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB55_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_nand_i32_noret_offset_scalar:
@@ -2195,21 +2185,19 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: v_not_b32_e32 v0, v0
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN3-NEXT: v_not_b32_e32 v2, v2
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB55_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2227,19 +2215,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: v_not_b32_e32 v0, v0
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_and_b32_e32 v0, s6, v4
+; GCN1-NEXT: v_not_b32_e32 v3, v0
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB56_1
@@ -2253,19 +2241,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: v_not_b32_e32 v0, v0
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_and_b32_e32 v0, s6, v4
+; GCN2-NEXT: v_not_b32_e32 v3, v0
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB56_1
@@ -2279,19 +2267,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: v_not_b32_e32 v0, v0
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_and_b32_e32 v0, s6, v4
+; GCN3-NEXT: v_not_b32_e32 v3, v0
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB56_1
@@ -2308,27 +2296,25 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
-; GCN1-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: v_not_b32_e32 v0, v0
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_and_b32_e32 v0, s6, v4
+; GCN1-NEXT: v_not_b32_e32 v3, v0
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB57_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_nand_i32_ret_offset_scalar:
@@ -2336,27 +2322,25 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: v_not_b32_e32 v0, v0
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_and_b32_e32 v0, s6, v4
+; GCN2-NEXT: v_not_b32_e32 v3, v0
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB57_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_nand_i32_ret_offset_scalar:
@@ -2365,19 +2349,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: v_not_b32_e32 v0, v0
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_and_b32_e32 v0, s6, v4
+; GCN3-NEXT: v_not_b32_e32 v3, v0
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB57_1
@@ -3532,20 +3516,18 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_max_i32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB84_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3557,20 +3539,18 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_max_i32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB84_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3582,20 +3562,18 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dword v1, v[0:1]
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB84_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3613,24 +3591,22 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_max_i32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB85_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_max_i32_noret_offset_scalar:
@@ -3640,24 +3616,22 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_max_i32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB85_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_max_i32_noret_offset_scalar:
@@ -3665,20 +3639,18 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB85_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3696,18 +3668,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_max_i32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB86_1
@@ -3721,18 +3693,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_max_i32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB86_1
@@ -3746,18 +3718,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_max_i32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB86_1
@@ -3774,26 +3746,24 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_max_i32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB87_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_max_i32_ret_offset_scalar:
@@ -3801,26 +3771,24 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_max_i32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB87_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_max_i32_ret_offset_scalar:
@@ -3829,18 +3797,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_max_i32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB87_1
@@ -3866,21 +3834,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_max_i32_e32 v0, s2, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_max_i32_e32 v2, s2, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB88_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
@@ -3898,21 +3864,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_max_i32_e32 v0, s2, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_max_i32_e32 v2, s2, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB88_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
@@ -3928,21 +3892,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_max_i32_e32 v0, s2, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, s0
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_max_i32_e32 v2, s2, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB88_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
@@ -3968,27 +3930,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: flat_load_dword v2, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: v_max_i32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB89_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v1, s2
-; GCN1-NEXT: v_mov_b32_e32 v2, s3
-; GCN1-NEXT: flat_store_dword v[1:2], v0
+; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v0, s2
+; GCN1-NEXT: v_mov_b32_e32 v1, s3
+; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
@@ -4005,27 +3965,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: flat_load_dword v2, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: v_max_i32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB89_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v1, s2
-; GCN2-NEXT: v_mov_b32_e32 v2, s3
-; GCN2-NEXT: flat_store_dword v[1:2], v0
+; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret_addr64_offset:
@@ -4040,27 +3998,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
-; GCN3-NEXT: v_mov_b32_e32 v2, s0
-; GCN3-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB89_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v1, s2
-; GCN3-NEXT: v_mov_b32_e32 v2, s3
-; GCN3-NEXT: flat_store_dword v[1:2], v0
+; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v0, s2
+; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i32 %index
@@ -4082,21 +4038,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_max_i32_e32 v0, s2, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_max_i32_e32 v2, s2, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB90_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
@@ -4112,21 +4066,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_max_i32_e32 v0, s2, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_max_i32_e32 v2, s2, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB90_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
@@ -4142,21 +4094,19 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dword v1, v[0:1]
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_max_i32_e32 v0, s2, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, s0
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_max_i32_e32 v2, s2, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB90_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
@@ -4179,27 +4129,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: flat_load_dword v2, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: v_max_i32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB91_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v1, s2
-; GCN1-NEXT: v_mov_b32_e32 v2, s3
-; GCN1-NEXT: flat_store_dword v[1:2], v0
+; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v0, s2
+; GCN1-NEXT: v_mov_b32_e32 v1, s3
+; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_max_i32_ret_addr64:
@@ -4214,27 +4162,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: flat_load_dword v2, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: v_max_i32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB91_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v1, s2
-; GCN2-NEXT: v_mov_b32_e32 v2, s3
-; GCN2-NEXT: flat_store_dword v[1:2], v0
+; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i32_ret_addr64:
@@ -4249,27 +4195,25 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dword v0, v[0:1]
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: flat_load_dword v2, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
-; GCN3-NEXT: v_mov_b32_e32 v2, s0
-; GCN3-NEXT: v_max_i32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB91_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v1, s2
-; GCN3-NEXT: v_mov_b32_e32 v2, s3
-; GCN3-NEXT: flat_store_dword v[1:2], v0
+; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v0, s2
+; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i32 %index
@@ -4715,20 +4659,18 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_max_u32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB98_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4740,20 +4682,18 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_max_u32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB98_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4765,20 +4705,18 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dword v1, v[0:1]
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_max_u32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB98_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4796,24 +4734,22 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_max_u32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB99_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_umax_i32_noret_offset_scalar:
@@ -4823,24 +4759,22 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_max_u32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB99_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_umax_i32_noret_offset_scalar:
@@ -4848,20 +4782,18 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_max_u32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB99_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4879,18 +4811,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_max_u32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB100_1
@@ -4904,18 +4836,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_max_u32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB100_1
@@ -4929,18 +4861,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_max_u32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB100_1
@@ -4957,26 +4889,24 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_max_u32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB101_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_umax_i32_ret_offset_scalar:
@@ -4984,26 +4914,24 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_max_u32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB101_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_umax_i32_ret_offset_scalar:
@@ -5012,18 +4940,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_max_u32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB101_1
@@ -5049,21 +4977,19 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_max_u32_e32 v0, s2, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_max_u32_e32 v2, s2, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB102_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
@@ -5081,21 +5007,19 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_max_u32_e32 v0, s2, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_max_u32_e32 v2, s2, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB102_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
@@ -5111,21 +5035,19 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_max_u32_e32 v0, s2, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, s0
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_max_u32_e32 v2, s2, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB102_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
@@ -5151,27 +5073,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: flat_load_dword v2, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: v_max_u32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB103_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v1, s2
-; GCN1-NEXT: v_mov_b32_e32 v2, s3
-; GCN1-NEXT: flat_store_dword v[1:2], v0
+; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v0, s2
+; GCN1-NEXT: v_mov_b32_e32 v1, s3
+; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
@@ -5188,27 +5108,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: flat_load_dword v2, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: v_max_u32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB103_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v1, s2
-; GCN2-NEXT: v_mov_b32_e32 v2, s3
-; GCN2-NEXT: flat_store_dword v[1:2], v0
+; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset:
@@ -5223,27 +5141,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
-; GCN3-NEXT: v_mov_b32_e32 v2, s0
-; GCN3-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: v_max_u32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB103_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v1, s2
-; GCN3-NEXT: v_mov_b32_e32 v2, s3
-; GCN3-NEXT: flat_store_dword v[1:2], v0
+; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v0, s2
+; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i32 %index
@@ -5266,27 +5182,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: flat_load_dword v2, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: v_max_u32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB104_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v1, s2
-; GCN1-NEXT: v_mov_b32_e32 v2, s3
-; GCN1-NEXT: flat_store_dword v[1:2], v0
+; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v0, s2
+; GCN1-NEXT: v_mov_b32_e32 v1, s3
+; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64:
@@ -5301,27 +5215,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: flat_load_dword v2, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: v_max_u32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB104_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v1, s2
-; GCN2-NEXT: v_mov_b32_e32 v2, s3
-; GCN2-NEXT: flat_store_dword v[1:2], v0
+; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i32_ret_addr64:
@@ -5336,27 +5248,25 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dword v0, v[0:1]
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: flat_load_dword v2, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
-; GCN3-NEXT: v_mov_b32_e32 v2, s0
-; GCN3-NEXT: v_max_u32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: v_max_u32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB104_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v1, s2
-; GCN3-NEXT: v_mov_b32_e32 v2, s3
-; GCN3-NEXT: flat_store_dword v[1:2], v0
+; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v0, s2
+; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i32 %index
@@ -5802,20 +5712,18 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_min_u32_e32 v0, s6, v1
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_min_u32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB111_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5827,20 +5735,18 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_min_u32_e32 v0, s6, v1
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_min_u32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB111_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5852,20 +5758,18 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dword v1, v[0:1]
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_min_u32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_min_u32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB111_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5883,24 +5787,22 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_min_u32_e32 v0, s6, v1
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_min_u32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB112_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_umin_i32_noret_offset_scalar:
@@ -5910,24 +5812,22 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_min_u32_e32 v0, s6, v1
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_min_u32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB112_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_umin_i32_noret_offset_scalar:
@@ -5935,20 +5835,18 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_min_u32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_min_u32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB112_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5966,18 +5864,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: v_min_u32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_min_u32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB113_1
@@ -5991,18 +5889,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: v_min_u32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_min_u32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB113_1
@@ -6016,18 +5914,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: v_min_u32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_min_u32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB113_1
@@ -6044,26 +5942,24 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: v_min_u32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_min_u32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB114_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_umin_i32_ret_offset_scalar:
@@ -6071,26 +5967,24 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: v_min_u32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_min_u32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB114_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_umin_i32_ret_offset_scalar:
@@ -6099,18 +5993,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: v_min_u32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_min_u32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB114_1
@@ -6559,20 +6453,18 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_min_i32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB121_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6584,20 +6476,18 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_min_i32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB121_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6609,20 +6499,18 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dword v1, v[0:1]
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_min_i32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB121_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6640,24 +6528,22 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_min_i32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB122_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_min_i32_noret_offset_scalar:
@@ -6667,24 +6553,22 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_min_i32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB122_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_min_i32_noret_offset_scalar:
@@ -6692,20 +6576,18 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_min_i32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB122_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6723,18 +6605,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_min_i32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB123_1
@@ -6748,18 +6630,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_min_i32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB123_1
@@ -6773,18 +6655,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_min_i32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB123_1
@@ -6801,26 +6683,24 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_min_i32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB124_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_min_i32_ret_offset_scalar:
@@ -6828,26 +6708,24 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_min_i32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB124_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_min_i32_ret_offset_scalar:
@@ -6856,18 +6734,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_min_i32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB124_1
@@ -6893,21 +6771,19 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_min_i32_e32 v0, s2, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_min_i32_e32 v2, s2, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB125_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
@@ -6925,21 +6801,19 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_min_i32_e32 v0, s2, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_min_i32_e32 v2, s2, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB125_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
@@ -6955,21 +6829,19 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_min_i32_e32 v0, s2, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, s0
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_min_i32_e32 v2, s2, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB125_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
@@ -6995,27 +6867,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: flat_load_dword v2, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: v_min_i32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB126_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v1, s2
-; GCN1-NEXT: v_mov_b32_e32 v2, s3
-; GCN1-NEXT: flat_store_dword v[1:2], v0
+; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v0, s2
+; GCN1-NEXT: v_mov_b32_e32 v1, s3
+; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
@@ -7032,27 +6902,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: flat_load_dword v2, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: v_min_i32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB126_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v1, s2
-; GCN2-NEXT: v_mov_b32_e32 v2, s3
-; GCN2-NEXT: flat_store_dword v[1:2], v0
+; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_addr64_offset:
@@ -7067,27 +6935,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
-; GCN3-NEXT: v_mov_b32_e32 v2, s0
-; GCN3-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: v_min_i32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB126_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v1, s2
-; GCN3-NEXT: v_mov_b32_e32 v2, s3
-; GCN3-NEXT: flat_store_dword v[1:2], v0
+; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v0, s2
+; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i32 %index
@@ -7100,78 +6966,72 @@ entry:
define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_min_i32:
; GCN1: ; %bb.0: ; %entry
-; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb
-; GCN1-NEXT: s_mov_b64 s[2:3], 0
+; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9
+; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb
+; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v0, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s7
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_min_i32_e32 v0, s4, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_min_i32_e32 v2, s2, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB127_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c
-; GCN2-NEXT: s_mov_b64 s[2:3], 0
+; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_min_i32_e32 v0, s4, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_min_i32_e32 v2, s2, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB127_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GCN3-NEXT: s_mov_b64 s[2:3], 0
+; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dword v1, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, s0
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_min_i32_e32 v2, s2, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB127_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
@@ -7193,27 +7053,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: v_mov_b32_e32 v0, s0
; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: flat_load_dword v2, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: v_min_i32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN1-NEXT: s_cbranch_execnz .LBB128_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v1, s2
-; GCN1-NEXT: v_mov_b32_e32 v2, s3
-; GCN1-NEXT: flat_store_dword v[1:2], v0
+; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v0, s2
+; GCN1-NEXT: v_mov_b32_e32 v1, s3
+; GCN1-NEXT: flat_store_dword v[0:1], v2
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_min_i32_ret_addr64:
@@ -7228,27 +7086,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: flat_load_dword v2, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: v_min_i32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB128_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v1, s2
-; GCN2-NEXT: v_mov_b32_e32 v2, s3
-; GCN2-NEXT: flat_store_dword v[1:2], v0
+; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i32_ret_addr64:
@@ -7263,27 +7119,25 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: s_addc_u32 s1, s1, s5
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dword v0, v[0:1]
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: flat_load_dword v2, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
-; GCN3-NEXT: v_mov_b32_e32 v2, s0
-; GCN3-NEXT: v_min_i32_e32 v0, s6, v1
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: v_min_i32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB128_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v1, s2
-; GCN3-NEXT: v_mov_b32_e32 v2, s3
-; GCN3-NEXT: flat_store_dword v[1:2], v0
+; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v0, s2
+; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i32 %index
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index 66ac4d2198ea55..36bddb7ac2fd68 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -6998,8 +6998,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
; GCN1-NEXT: v_and_b32_e32 v6, s6, v2
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
; GCN1-NEXT: v_not_b32_e32 v1, v0
; GCN1-NEXT: v_not_b32_e32 v0, v6
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -7065,8 +7063,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
; GCN2-NEXT: v_and_b32_e32 v6, s6, v2
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
; GCN2-NEXT: v_not_b32_e32 v1, v0
; GCN2-NEXT: v_not_b32_e32 v0, v6
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -7115,17 +7111,15 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN3-NEXT: .LBB54_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB54_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: .LBB54_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
; GCN3-NEXT: v_and_b32_e32 v6, s6, v2
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
; GCN3-NEXT: v_not_b32_e32 v1, v0
; GCN3-NEXT: v_not_b32_e32 v0, v6
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -7194,8 +7188,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
; GCN1-NEXT: v_and_b32_e32 v6, s6, v2
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
; GCN1-NEXT: v_not_b32_e32 v1, v0
; GCN1-NEXT: v_not_b32_e32 v0, v6
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -7263,8 +7255,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
; GCN2-NEXT: v_and_b32_e32 v6, s6, v2
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
; GCN2-NEXT: v_not_b32_e32 v1, v0
; GCN2-NEXT: v_not_b32_e32 v0, v6
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -7315,17 +7305,15 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: .LBB55_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB55_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[36:37], 0
; GCN3-NEXT: .LBB55_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
; GCN3-NEXT: v_and_b32_e32 v6, s6, v2
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
; GCN3-NEXT: v_not_b32_e32 v1, v0
; GCN3-NEXT: v_not_b32_e32 v0, v6
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -7385,18 +7373,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: .LBB56_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
-; GCN1-NEXT: v_and_b32_e32 v6, s6, v2
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_not_b32_e32 v1, v0
-; GCN1-NEXT: v_not_b32_e32 v0, v6
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: v_and_b32_e32 v0, s7, v7
+; GCN1-NEXT: v_and_b32_e32 v1, s6, v6
+; GCN1-NEXT: v_not_b32_e32 v5, v0
+; GCN1-NEXT: v_not_b32_e32 v4, v1
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB56_2
@@ -7450,18 +7436,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: .LBB56_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
-; GCN2-NEXT: v_and_b32_e32 v6, s6, v2
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_not_b32_e32 v1, v0
-; GCN2-NEXT: v_not_b32_e32 v0, v6
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: v_and_b32_e32 v0, s7, v7
+; GCN2-NEXT: v_and_b32_e32 v1, s6, v6
+; GCN2-NEXT: v_not_b32_e32 v5, v0
+; GCN2-NEXT: v_not_b32_e32 v4, v1
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB56_2
@@ -7500,25 +7484,23 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN3-NEXT: s_cbranch_vccz .LBB56_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: .LBB56_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
-; GCN3-NEXT: v_and_b32_e32 v6, s6, v2
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_not_b32_e32 v1, v0
-; GCN3-NEXT: v_not_b32_e32 v0, v6
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v7, v1
+; GCN3-NEXT: v_mov_b32_e32 v6, v0
+; GCN3-NEXT: v_and_b32_e32 v0, s7, v7
+; GCN3-NEXT: v_and_b32_e32 v1, s6, v6
+; GCN3-NEXT: v_not_b32_e32 v5, v0
+; GCN3-NEXT: v_not_b32_e32 v4, v1
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB56_2
@@ -7575,18 +7557,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN1-NEXT: .LBB57_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
-; GCN1-NEXT: v_and_b32_e32 v6, s6, v2
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_not_b32_e32 v1, v0
-; GCN1-NEXT: v_not_b32_e32 v0, v6
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: v_and_b32_e32 v0, s7, v7
+; GCN1-NEXT: v_and_b32_e32 v1, s6, v6
+; GCN1-NEXT: v_not_b32_e32 v5, v0
+; GCN1-NEXT: v_not_b32_e32 v4, v1
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN1-NEXT: s_cbranch_execnz .LBB57_2
@@ -7642,18 +7622,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN2-NEXT: .LBB57_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
-; GCN2-NEXT: v_and_b32_e32 v6, s6, v2
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_not_b32_e32 v1, v0
-; GCN2-NEXT: v_not_b32_e32 v0, v6
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: v_and_b32_e32 v0, s7, v7
+; GCN2-NEXT: v_and_b32_e32 v1, s6, v6
+; GCN2-NEXT: v_not_b32_e32 v5, v0
+; GCN2-NEXT: v_not_b32_e32 v4, v1
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN2-NEXT: s_cbranch_execnz .LBB57_2
@@ -7694,25 +7672,23 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN3-NEXT: s_cbranch_vccz .LBB57_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: v_mov_b32_e32 v3, s35
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[36:37], 0
; GCN3-NEXT: .LBB57_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
-; GCN3-NEXT: v_and_b32_e32 v6, s6, v2
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_not_b32_e32 v1, v0
-; GCN3-NEXT: v_not_b32_e32 v0, v6
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v7, v1
+; GCN3-NEXT: v_mov_b32_e32 v6, v0
+; GCN3-NEXT: v_and_b32_e32 v0, s7, v7
+; GCN3-NEXT: v_and_b32_e32 v1, s6, v6
+; GCN3-NEXT: v_not_b32_e32 v5, v0
+; GCN3-NEXT: v_not_b32_e32 v4, v1
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN3-NEXT: s_cbranch_execnz .LBB57_2
@@ -11446,16 +11422,14 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: flat_load_dword v2, v[4:5]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
; GCN1-NEXT: .LBB84_4: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
@@ -11514,16 +11488,14 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: flat_load_dword v2, v[4:5]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB84_4: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -11570,20 +11542,18 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN3-NEXT: .LBB84_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB84_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: .LBB84_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -11645,16 +11615,14 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: flat_load_dword v2, v[4:5]
; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
; GCN1-NEXT: .LBB85_4: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
@@ -11715,16 +11683,14 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: flat_load_dword v2, v[4:5]
; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB85_4: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -11773,20 +11739,18 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: .LBB85_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB85_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: .LBB85_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -11841,22 +11805,20 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[2:3]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: .LBB86_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB86_2
@@ -11907,22 +11869,20 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[2:3]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: .LBB86_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB86_2
@@ -11961,26 +11921,24 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN3-NEXT: s_cbranch_vccz .LBB86_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: .LBB86_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB86_2
@@ -12034,22 +11992,20 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[2:3]
; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: .LBB87_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN1-NEXT: s_cbranch_execnz .LBB87_2
@@ -12102,22 +12058,20 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[2:3]
; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: .LBB87_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN2-NEXT: s_cbranch_execnz .LBB87_2
@@ -12158,26 +12112,24 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN3-NEXT: s_cbranch_vccz .LBB87_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: v_mov_b32_e32 v3, s35
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: .LBB87_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN3-NEXT: s_cbranch_execnz .LBB87_2
@@ -12238,20 +12190,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN1-NEXT: .LBB88_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB88_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
+; GCN1-NEXT: v_mov_b32_e32 v4, s0
+; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s3
+; GCN1-NEXT: v_mov_b32_e32 v7, s2
; GCN1-NEXT: .LBB88_4: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s3
-; GCN1-NEXT: v_mov_b32_e32 v6, s2
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
@@ -12311,20 +12261,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-NEXT: .LBB88_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB88_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
+; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s3
+; GCN2-NEXT: v_mov_b32_e32 v7, s2
; GCN2-NEXT: .LBB88_4: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s3
-; GCN2-NEXT: v_mov_b32_e32 v6, s2
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -12383,20 +12331,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN3-NEXT: .LBB88_2: ; %atomicrmw.phi
; GCN3-NEXT: s_endpgm
; GCN3-NEXT: .LBB88_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v5, s1
+; GCN3-NEXT: v_mov_b32_e32 v4, s0
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s3
+; GCN3-NEXT: v_mov_b32_e32 v7, s2
; GCN3-NEXT: .LBB88_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s3
-; GCN3-NEXT: v_mov_b32_e32 v6, s2
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -12453,26 +12399,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB89_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s13
+; GCN1-NEXT: v_mov_b32_e32 v5, s12
; GCN1-NEXT: .LBB89_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s13
-; GCN1-NEXT: v_mov_b32_e32 v6, s12
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB89_2
@@ -12526,26 +12470,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB89_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s13
+; GCN2-NEXT: v_mov_b32_e32 v5, s12
; GCN2-NEXT: .LBB89_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s13
-; GCN2-NEXT: v_mov_b32_e32 v6, s12
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB89_2
@@ -12598,26 +12540,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN3-NEXT: s_cbranch_vccz .LBB89_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, s1
+; GCN3-NEXT: v_mov_b32_e32 v2, s0
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s13
+; GCN3-NEXT: v_mov_b32_e32 v5, s12
; GCN3-NEXT: .LBB89_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s13
-; GCN3-NEXT: v_mov_b32_e32 v6, s12
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB89_2
@@ -12681,20 +12621,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
+; GCN1-NEXT: v_mov_b32_e32 v4, s0
+; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s3
+; GCN1-NEXT: v_mov_b32_e32 v7, s2
; GCN1-NEXT: .LBB90_4: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s3
-; GCN1-NEXT: v_mov_b32_e32 v6, s2
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
@@ -12752,20 +12690,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
+; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s3
+; GCN2-NEXT: v_mov_b32_e32 v7, s2
; GCN2-NEXT: .LBB90_4: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s3
-; GCN2-NEXT: v_mov_b32_e32 v6, s2
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -12822,20 +12758,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN3-NEXT: .LBB90_2: ; %atomicrmw.phi
; GCN3-NEXT: s_endpgm
; GCN3-NEXT: .LBB90_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v5, s1
+; GCN3-NEXT: v_mov_b32_e32 v4, s0
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s3
+; GCN3-NEXT: v_mov_b32_e32 v7, s2
; GCN3-NEXT: .LBB90_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s3
-; GCN3-NEXT: v_mov_b32_e32 v6, s2
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -12889,26 +12823,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB91_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s13
+; GCN1-NEXT: v_mov_b32_e32 v5, s12
; GCN1-NEXT: .LBB91_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s13
-; GCN1-NEXT: v_mov_b32_e32 v6, s12
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB91_2
@@ -12960,26 +12892,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB91_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s13
+; GCN2-NEXT: v_mov_b32_e32 v5, s12
; GCN2-NEXT: .LBB91_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s13
-; GCN2-NEXT: v_mov_b32_e32 v6, s12
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB91_2
@@ -13030,26 +12960,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN3-NEXT: s_cbranch_vccz .LBB91_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, s1
+; GCN3-NEXT: v_mov_b32_e32 v2, s0
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s13
+; GCN3-NEXT: v_mov_b32_e32 v5, s12
; GCN3-NEXT: .LBB91_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s13
-; GCN3-NEXT: v_mov_b32_e32 v6, s12
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB91_2
@@ -14197,16 +14125,14 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: flat_load_dword v2, v[4:5]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
; GCN1-NEXT: .LBB98_4: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
@@ -14265,16 +14191,14 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: flat_load_dword v2, v[4:5]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB98_4: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -14321,20 +14245,18 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN3-NEXT: .LBB98_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB98_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: .LBB98_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -14396,16 +14318,14 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: flat_load_dword v2, v[4:5]
; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
; GCN1-NEXT: .LBB99_4: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
@@ -14466,16 +14386,14 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: flat_load_dword v2, v[4:5]
; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB99_4: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -14524,20 +14442,18 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: .LBB99_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB99_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: .LBB99_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -14592,22 +14508,20 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[2:3]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: .LBB100_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB100_2
@@ -14658,22 +14572,20 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[2:3]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: .LBB100_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB100_2
@@ -14712,26 +14624,24 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN3-NEXT: s_cbranch_vccz .LBB100_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: .LBB100_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB100_2
@@ -14785,22 +14695,20 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[2:3]
; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: .LBB101_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN1-NEXT: s_cbranch_execnz .LBB101_2
@@ -14853,22 +14761,20 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[2:3]
; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: .LBB101_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN2-NEXT: s_cbranch_execnz .LBB101_2
@@ -14909,26 +14815,24 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN3-NEXT: s_cbranch_vccz .LBB101_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: v_mov_b32_e32 v3, s35
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: .LBB101_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN3-NEXT: s_cbranch_execnz .LBB101_2
@@ -14989,20 +14893,18 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN1-NEXT: .LBB102_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB102_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
+; GCN1-NEXT: v_mov_b32_e32 v4, s0
+; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s3
+; GCN1-NEXT: v_mov_b32_e32 v7, s2
; GCN1-NEXT: .LBB102_4: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s3
-; GCN1-NEXT: v_mov_b32_e32 v6, s2
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
@@ -15062,20 +14964,18 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-NEXT: .LBB102_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB102_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
+; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s3
+; GCN2-NEXT: v_mov_b32_e32 v7, s2
; GCN2-NEXT: .LBB102_4: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s3
-; GCN2-NEXT: v_mov_b32_e32 v6, s2
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -15134,20 +15034,18 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN3-NEXT: .LBB102_2: ; %atomicrmw.phi
; GCN3-NEXT: s_endpgm
; GCN3-NEXT: .LBB102_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v5, s1
+; GCN3-NEXT: v_mov_b32_e32 v4, s0
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s3
+; GCN3-NEXT: v_mov_b32_e32 v7, s2
; GCN3-NEXT: .LBB102_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s3
-; GCN3-NEXT: v_mov_b32_e32 v6, s2
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -15204,26 +15102,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB103_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s13
+; GCN1-NEXT: v_mov_b32_e32 v5, s12
; GCN1-NEXT: .LBB103_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s13
-; GCN1-NEXT: v_mov_b32_e32 v6, s12
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB103_2
@@ -15277,26 +15173,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB103_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s13
+; GCN2-NEXT: v_mov_b32_e32 v5, s12
; GCN2-NEXT: .LBB103_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s13
-; GCN2-NEXT: v_mov_b32_e32 v6, s12
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB103_2
@@ -15349,26 +15243,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN3-NEXT: s_cbranch_vccz .LBB103_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, s1
+; GCN3-NEXT: v_mov_b32_e32 v2, s0
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s13
+; GCN3-NEXT: v_mov_b32_e32 v5, s12
; GCN3-NEXT: .LBB103_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s13
-; GCN3-NEXT: v_mov_b32_e32 v6, s12
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB103_2
@@ -15425,26 +15317,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB104_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s13
+; GCN1-NEXT: v_mov_b32_e32 v5, s12
; GCN1-NEXT: .LBB104_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s13
-; GCN1-NEXT: v_mov_b32_e32 v6, s12
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB104_2
@@ -15496,26 +15386,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB104_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s13
+; GCN2-NEXT: v_mov_b32_e32 v5, s12
; GCN2-NEXT: .LBB104_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s13
-; GCN2-NEXT: v_mov_b32_e32 v6, s12
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB104_2
@@ -15566,26 +15454,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN3-NEXT: s_cbranch_vccz .LBB104_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, s1
+; GCN3-NEXT: v_mov_b32_e32 v2, s0
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s13
+; GCN3-NEXT: v_mov_b32_e32 v5, s12
; GCN3-NEXT: .LBB104_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s13
-; GCN3-NEXT: v_mov_b32_e32 v6, s12
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB104_2
@@ -16733,16 +16619,14 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: flat_load_dword v2, v[4:5]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
; GCN1-NEXT: .LBB111_4: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
@@ -16801,16 +16685,14 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: flat_load_dword v2, v[4:5]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB111_4: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -16857,20 +16739,18 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN3-NEXT: .LBB111_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB111_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: .LBB111_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -16932,16 +16812,14 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: flat_load_dword v2, v[4:5]
; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
; GCN1-NEXT: .LBB112_4: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
@@ -17002,16 +16880,14 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: flat_load_dword v2, v[4:5]
; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB112_4: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -17060,20 +16936,18 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: .LBB112_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB112_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: .LBB112_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -17128,22 +17002,20 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[2:3]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: .LBB113_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB113_2
@@ -17194,22 +17066,20 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[2:3]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: .LBB113_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB113_2
@@ -17248,26 +17118,24 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN3-NEXT: s_cbranch_vccz .LBB113_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: .LBB113_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB113_2
@@ -17321,22 +17189,20 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[2:3]
; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: .LBB114_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN1-NEXT: s_cbranch_execnz .LBB114_2
@@ -17389,22 +17255,20 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[2:3]
; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: .LBB114_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN2-NEXT: s_cbranch_execnz .LBB114_2
@@ -17445,26 +17309,24 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN3-NEXT: s_cbranch_vccz .LBB114_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: v_mov_b32_e32 v3, s35
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: .LBB114_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN3-NEXT: s_cbranch_execnz .LBB114_2
@@ -18608,16 +18470,14 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: flat_load_dword v2, v[4:5]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
; GCN1-NEXT: .LBB121_4: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
@@ -18676,16 +18536,14 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: flat_load_dword v2, v[4:5]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB121_4: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -18732,20 +18590,18 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN3-NEXT: .LBB121_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB121_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: .LBB121_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -18807,16 +18663,14 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: flat_load_dword v2, v[4:5]
; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
; GCN1-NEXT: .LBB122_4: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
@@ -18877,16 +18731,14 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: flat_load_dword v2, v[4:5]
; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB122_4: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -18935,20 +18787,18 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: .LBB122_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB122_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: .LBB122_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -19003,22 +18853,20 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[2:3]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: .LBB123_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s4
-; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB123_2
@@ -19069,22 +18917,20 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[2:3]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: .LBB123_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s4
-; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB123_2
@@ -19123,26 +18969,24 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN3-NEXT: s_cbranch_vccz .LBB123_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: .LBB123_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s4
-; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB123_2
@@ -19196,22 +19040,20 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[2:3]
; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: .LBB124_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s7
-; GCN1-NEXT: v_mov_b32_e32 v6, s6
-; GCN1-NEXT: v_mov_b32_e32 v4, s34
-; GCN1-NEXT: v_mov_b32_e32 v5, s35
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN1-NEXT: s_cbranch_execnz .LBB124_2
@@ -19264,22 +19106,20 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[2:3]
; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: .LBB124_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s7
-; GCN2-NEXT: v_mov_b32_e32 v6, s6
-; GCN2-NEXT: v_mov_b32_e32 v4, s34
-; GCN2-NEXT: v_mov_b32_e32 v5, s35
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN2-NEXT: s_cbranch_execnz .LBB124_2
@@ -19320,26 +19160,24 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN3-NEXT: s_cbranch_vccz .LBB124_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: v_mov_b32_e32 v3, s35
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: .LBB124_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s7
-; GCN3-NEXT: v_mov_b32_e32 v6, s6
-; GCN3-NEXT: v_mov_b32_e32 v4, s34
-; GCN3-NEXT: v_mov_b32_e32 v5, s35
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
; GCN3-NEXT: s_cbranch_execnz .LBB124_2
@@ -19400,20 +19238,18 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN1-NEXT: .LBB125_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB125_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
+; GCN1-NEXT: v_mov_b32_e32 v4, s0
+; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s3
+; GCN1-NEXT: v_mov_b32_e32 v7, s2
; GCN1-NEXT: .LBB125_4: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s3
-; GCN1-NEXT: v_mov_b32_e32 v6, s2
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
@@ -19473,20 +19309,18 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-NEXT: .LBB125_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB125_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
+; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s3
+; GCN2-NEXT: v_mov_b32_e32 v7, s2
; GCN2-NEXT: .LBB125_4: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s3
-; GCN2-NEXT: v_mov_b32_e32 v6, s2
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -19545,20 +19379,18 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN3-NEXT: .LBB125_2: ; %atomicrmw.phi
; GCN3-NEXT: s_endpgm
; GCN3-NEXT: .LBB125_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v5, s1
+; GCN3-NEXT: v_mov_b32_e32 v4, s0
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s3
+; GCN3-NEXT: v_mov_b32_e32 v7, s2
; GCN3-NEXT: .LBB125_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s3
-; GCN3-NEXT: v_mov_b32_e32 v6, s2
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -19615,26 +19447,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB126_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s13
+; GCN1-NEXT: v_mov_b32_e32 v5, s12
; GCN1-NEXT: .LBB126_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s13
-; GCN1-NEXT: v_mov_b32_e32 v6, s12
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB126_2
@@ -19688,26 +19518,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB126_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s13
+; GCN2-NEXT: v_mov_b32_e32 v5, s12
; GCN2-NEXT: .LBB126_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s13
-; GCN2-NEXT: v_mov_b32_e32 v6, s12
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB126_2
@@ -19760,26 +19588,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN3-NEXT: s_cbranch_vccz .LBB126_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, s1
+; GCN3-NEXT: v_mov_b32_e32 v2, s0
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s13
+; GCN3-NEXT: v_mov_b32_e32 v5, s12
; GCN3-NEXT: .LBB126_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s13
-; GCN3-NEXT: v_mov_b32_e32 v6, s12
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB126_2
@@ -19839,20 +19665,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN1-NEXT: .LBB127_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB127_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
+; GCN1-NEXT: v_mov_b32_e32 v4, s0
+; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s3
+; GCN1-NEXT: v_mov_b32_e32 v7, s2
; GCN1-NEXT: .LBB127_4: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s3
-; GCN1-NEXT: v_mov_b32_e32 v6, s2
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
@@ -19906,20 +19730,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN2-NEXT: .LBB127_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB127_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
+; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s3
+; GCN2-NEXT: v_mov_b32_e32 v7, s2
; GCN2-NEXT: .LBB127_4: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s3
-; GCN2-NEXT: v_mov_b32_e32 v6, s2
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -19972,20 +19794,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN3-NEXT: .LBB127_2: ; %atomicrmw.phi
; GCN3-NEXT: s_endpgm
; GCN3-NEXT: .LBB127_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v5, s1
+; GCN3-NEXT: v_mov_b32_e32 v4, s0
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s3
+; GCN3-NEXT: v_mov_b32_e32 v7, s2
; GCN3-NEXT: .LBB127_4: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s3
-; GCN3-NEXT: v_mov_b32_e32 v6, s2
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
@@ -20038,26 +19858,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB128_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN1-NEXT: s_mov_b64 s[2:3], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s13
+; GCN1-NEXT: v_mov_b32_e32 v5, s12
; GCN1-NEXT: .LBB128_2: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3]
-; GCN1-NEXT: v_mov_b32_e32 v0, s13
-; GCN1-NEXT: v_mov_b32_e32 v6, s12
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
-; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN1-NEXT: s_cbranch_execnz .LBB128_2
@@ -20109,26 +19927,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB128_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_mov_b64 s[2:3], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s13
+; GCN2-NEXT: v_mov_b32_e32 v5, s12
; GCN2-NEXT: .LBB128_2: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3]
-; GCN2-NEXT: v_mov_b32_e32 v0, s13
-; GCN2-NEXT: v_mov_b32_e32 v6, s12
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN2-NEXT: s_cbranch_execnz .LBB128_2
@@ -20179,26 +19995,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN3-NEXT: s_cbranch_vccz .LBB128_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v3, s1
+; GCN3-NEXT: v_mov_b32_e32 v2, s0
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_mov_b64 s[2:3], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s13
+; GCN3-NEXT: v_mov_b32_e32 v5, s12
; GCN3-NEXT: .LBB128_2: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3]
-; GCN3-NEXT: v_mov_b32_e32 v0, s13
-; GCN3-NEXT: v_mov_b32_e32 v6, s12
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
-; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN3-NEXT: s_cbranch_execnz .LBB128_2
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
index 4a4fcfe5afcc88..fe47461ebf9569 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
@@ -2187,14 +2187,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, s7, v3
; GFX7-NEXT: v_and_b32_e32 v6, s6, v2
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: v_not_b32_e32 v1, v0
; GFX7-NEXT: v_not_b32_e32 v0, v6
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -2221,14 +2221,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, s7, v3
; GFX8-NEXT: v_and_b32_e32 v6, s6, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: v_not_b32_e32 v1, v0
; GFX8-NEXT: v_not_b32_e32 v0, v6
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -2250,14 +2250,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, s7, v3
; GFX9-NEXT: v_and_b32_e32 v6, s6, v2
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_not_b32_e32 v1, v0
; GFX9-NEXT: v_not_b32_e32 v0, v6
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -2290,14 +2290,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: v_mov_b32_e32 v5, s35
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: flat_load_dword v2, v[4:5]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, s7, v3
; GFX7-NEXT: v_and_b32_e32 v6, s6, v2
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
; GFX7-NEXT: v_not_b32_e32 v1, v0
; GFX7-NEXT: v_not_b32_e32 v0, v6
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -2305,12 +2303,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB55_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
@@ -2326,14 +2324,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[4:5]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, s7, v3
; GFX8-NEXT: v_and_b32_e32 v6, s6, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
-; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: v_not_b32_e32 v1, v0
; GFX8-NEXT: v_not_b32_e32 v0, v6
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -2341,12 +2337,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB55_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
@@ -2355,14 +2351,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, s7, v3
; GFX9-NEXT: v_and_b32_e32 v6, s6, v2
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_not_b32_e32 v1, v0
; GFX9-NEXT: v_not_b32_e32 v0, v6
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
@@ -2394,22 +2390,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_and_b32_e32 v0, s7, v3
-; GFX7-NEXT: v_and_b32_e32 v6, s6, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-NEXT: v_not_b32_e32 v1, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v6
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_and_b32_e32 v0, s7, v7
+; GFX7-NEXT: v_and_b32_e32 v1, s6, v6
+; GFX7-NEXT: v_not_b32_e32 v5, v0
+; GFX7-NEXT: v_not_b32_e32 v4, v1
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB56_1
@@ -2428,22 +2424,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_and_b32_e32 v0, s7, v3
-; GFX8-NEXT: v_and_b32_e32 v6, s6, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_not_b32_e32 v1, v0
-; GFX8-NEXT: v_not_b32_e32 v0, v6
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_and_b32_e32 v0, s7, v7
+; GFX8-NEXT: v_and_b32_e32 v1, s6, v6
+; GFX8-NEXT: v_not_b32_e32 v5, v0
+; GFX8-NEXT: v_not_b32_e32 v4, v1
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB56_1
@@ -2457,22 +2453,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_and_b32_e32 v0, s7, v3
-; GFX9-NEXT: v_and_b32_e32 v6, s6, v2
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_not_b32_e32 v1, v0
-; GFX9-NEXT: v_not_b32_e32 v0, v6
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_and_b32_e32 v0, s7, v7
+; GFX9-NEXT: v_and_b32_e32 v1, s6, v6
+; GFX9-NEXT: v_not_b32_e32 v5, v0
+; GFX9-NEXT: v_not_b32_e32 v4, v1
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB56_1
@@ -2497,27 +2493,25 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[2:3]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_and_b32_e32 v0, s7, v3
-; GFX7-NEXT: v_and_b32_e32 v6, s6, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
-; GFX7-NEXT: v_not_b32_e32 v1, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v6
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_and_b32_e32 v0, s7, v7
+; GFX7-NEXT: v_and_b32_e32 v1, s6, v6
+; GFX7-NEXT: v_not_b32_e32 v5, v0
+; GFX7-NEXT: v_not_b32_e32 v4, v1
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB57_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
@@ -2533,27 +2527,25 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[2:3]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
-; GFX8-NEXT: v_and_b32_e32 v0, s7, v3
-; GFX8-NEXT: v_and_b32_e32 v6, s6, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s35
-; GFX8-NEXT: v_not_b32_e32 v1, v0
-; GFX8-NEXT: v_not_b32_e32 v0, v6
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_and_b32_e32 v0, s7, v7
+; GFX8-NEXT: v_and_b32_e32 v1, s6, v6
+; GFX8-NEXT: v_not_b32_e32 v5, v0
+; GFX8-NEXT: v_not_b32_e32 v4, v1
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB57_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
@@ -2562,22 +2554,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_and_b32_e32 v0, s7, v3
-; GFX9-NEXT: v_and_b32_e32 v6, s6, v2
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_not_b32_e32 v1, v0
-; GFX9-NEXT: v_not_b32_e32 v0, v6
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_and_b32_e32 v0, s7, v7
+; GFX9-NEXT: v_and_b32_e32 v1, s6, v6
+; GFX9-NEXT: v_not_b32_e32 v5, v0
+; GFX9-NEXT: v_not_b32_e32 v4, v1
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB57_1
@@ -3857,17 +3849,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
@@ -3892,17 +3884,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
@@ -3922,17 +3914,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -3963,28 +3955,26 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: v_mov_b32_e32 v5, s35
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: flat_load_dword v2, v[4:5]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB85_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_max_i64_noret_offset_scalar:
@@ -4000,28 +3990,26 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[4:5]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
-; GFX8-NEXT: v_mov_b32_e32 v5, s35
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB85_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_max_i64_noret_offset_scalar:
@@ -4030,17 +4018,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -4070,23 +4058,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB86_1
@@ -4105,23 +4093,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB86_1
@@ -4135,23 +4123,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB86_1
@@ -4176,28 +4164,26 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[2:3]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB87_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_max_i64_ret_offset_scalar:
@@ -4213,28 +4199,26 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[2:3]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
-; GFX8-NEXT: v_mov_b32_e32 v5, s35
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB87_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_max_i64_ret_offset_scalar:
@@ -4243,23 +4227,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB87_1
@@ -4282,28 +4266,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-NEXT: v_mov_b32_e32 v6, s2
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB88_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
@@ -4318,28 +4300,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB88_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
@@ -4352,28 +4332,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_mov_b32_e32 v7, s2
; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB88_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
@@ -4396,32 +4374,30 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-NEXT: v_mov_b32_e32 v6, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB89_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret_addr64_offset:
@@ -4435,69 +4411,65 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB89_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s13
+; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v6, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB89_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v2, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
@@ -4516,28 +4488,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-NEXT: v_mov_b32_e32 v6, s2
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB90_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
@@ -4550,28 +4520,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB90_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
@@ -4584,28 +4552,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_mov_b32_e32 v7, s2
; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB90_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
@@ -4625,32 +4591,30 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-NEXT: v_mov_b32_e32 v6, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB91_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret_addr64:
@@ -4662,69 +4626,65 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB91_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s13
+; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v6, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB91_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v2, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
@@ -5271,17 +5231,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB98_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
@@ -5306,17 +5266,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB98_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
@@ -5336,17 +5296,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -5377,28 +5337,26 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: v_mov_b32_e32 v5, s35
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: flat_load_dword v2, v[4:5]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB99_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB99_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
@@ -5414,28 +5372,26 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[4:5]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB99_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
-; GFX8-NEXT: v_mov_b32_e32 v5, s35
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB99_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
@@ -5444,17 +5400,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -5484,23 +5440,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB100_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB100_1
@@ -5519,23 +5475,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB100_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB100_1
@@ -5549,23 +5505,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB100_1
@@ -5590,28 +5546,26 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[2:3]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: .LBB101_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB101_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
@@ -5627,28 +5581,26 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[2:3]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: .LBB101_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
-; GFX8-NEXT: v_mov_b32_e32 v5, s35
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB101_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
@@ -5657,23 +5609,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB101_1
@@ -5696,28 +5648,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB102_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-NEXT: v_mov_b32_e32 v6, s2
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB102_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
@@ -5732,28 +5682,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB102_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB102_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
@@ -5766,28 +5714,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_mov_b32_e32 v7, s2
; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB102_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
@@ -5810,32 +5756,30 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB103_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-NEXT: v_mov_b32_e32 v6, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB103_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset:
@@ -5849,69 +5793,65 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB103_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB103_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s13
+; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v6, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB103_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v2, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
@@ -5931,32 +5871,30 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB104_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-NEXT: v_mov_b32_e32 v6, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB104_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret_addr64:
@@ -5968,69 +5906,65 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB104_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB104_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s13
+; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v6, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB104_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v2, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
@@ -6577,17 +6511,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
@@ -6612,17 +6546,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
@@ -6642,17 +6576,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6683,28 +6617,26 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: v_mov_b32_e32 v5, s35
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: flat_load_dword v2, v[4:5]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB112_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
@@ -6720,28 +6652,26 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[4:5]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
-; GFX8-NEXT: v_mov_b32_e32 v5, s35
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB112_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
@@ -6750,17 +6680,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6790,23 +6720,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB113_1
@@ -6825,23 +6755,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB113_1
@@ -6855,23 +6785,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB113_1
@@ -6896,28 +6826,26 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[2:3]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB114_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
@@ -6933,28 +6861,26 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[2:3]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
-; GFX8-NEXT: v_mov_b32_e32 v5, s35
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB114_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
@@ -6963,23 +6889,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB114_1
@@ -7529,17 +7455,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
@@ -7564,17 +7490,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
@@ -7594,17 +7520,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -7635,28 +7561,26 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: v_mov_b32_e32 v5, s35
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: flat_load_dword v2, v[4:5]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB122_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_min_i64_noret_offset_scalar:
@@ -7672,28 +7596,26 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[4:5]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
-; GFX8-NEXT: v_mov_b32_e32 v5, s35
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB122_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_min_i64_noret_offset_scalar:
@@ -7702,17 +7624,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -7742,23 +7664,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB123_1
@@ -7777,23 +7699,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB123_1
@@ -7807,23 +7729,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB123_1
@@ -7848,28 +7770,26 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[2:3]
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
-; GFX7-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-NEXT: v_mov_b32_e32 v5, s35
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB124_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_min_i64_ret_offset_scalar:
@@ -7885,28 +7805,26 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[2:3]
-; GFX8-NEXT: s_mov_b64 s[36:37], 0
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
-; GFX8-NEXT: v_mov_b32_e32 v5, s35
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB124_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[36:37]
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_min_i64_ret_offset_scalar:
@@ -7915,23 +7833,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB124_1
@@ -7954,28 +7872,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB125_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-NEXT: v_mov_b32_e32 v6, s2
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB125_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
@@ -7990,28 +7906,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB125_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB125_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
@@ -8024,28 +7938,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_mov_b32_e32 v7, s2
; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB125_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
@@ -8068,32 +7980,30 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-NEXT: v_mov_b32_e32 v6, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB126_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret_addr64_offset:
@@ -8107,69 +8017,65 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB126_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s13
+; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v6, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB126_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v2, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
@@ -8188,16 +8094,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-NEXT: v_mov_b32_e32 v6, s2
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
@@ -8218,16 +8124,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
@@ -8248,16 +8154,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_mov_b32_e32 v7, s2
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -8284,32 +8190,30 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB128_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-NEXT: v_mov_b32_e32 v6, s4
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB128_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret_addr64:
@@ -8321,69 +8225,65 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB128_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB128_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s13
+; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v6, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB128_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v2, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 09230d21544ff2..4aec2ffead4372 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1426,23 +1426,23 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index d0fc798e55b6ea..ec4ea232e661cf 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -18501,39 +18501,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -18548,34 +18550,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB78_1
@@ -18589,33 +18591,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB78_1
@@ -18629,33 +18631,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB78_1
@@ -18669,34 +18671,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB78_1
@@ -18827,39 +18829,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -18874,34 +18878,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB79_1
@@ -18915,33 +18919,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB79_1
@@ -18955,33 +18959,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB79_1
@@ -18997,34 +19001,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_add_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB79_1
@@ -19155,39 +19159,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -19202,34 +19208,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB80_1
@@ -19243,33 +19249,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB80_1
@@ -19283,33 +19289,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB80_1
@@ -19325,34 +19331,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_add_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB80_1
@@ -19486,41 +19492,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -19533,35 +19539,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB81_1
@@ -19572,36 +19578,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB81_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19611,36 +19617,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB81_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19650,37 +19656,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB81_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19804,41 +19810,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB82_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -19851,35 +19857,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB82_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB82_1
@@ -19890,36 +19896,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB82_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB82_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19929,36 +19935,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB82_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB82_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19970,37 +19976,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB82_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -20125,41 +20131,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB83_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -20172,35 +20178,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB83_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB83_1
@@ -20211,36 +20217,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB83_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB83_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -20250,36 +20256,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB83_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB83_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -20291,37 +20297,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB83_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -20456,39 +20462,41 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -20503,34 +20511,34 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB84_1
@@ -20544,35 +20552,35 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB84_1
@@ -20586,33 +20594,33 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB84_1
@@ -20628,34 +20636,34 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_add_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB84_1
@@ -20786,41 +20794,41 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -20833,35 +20841,35 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB85_1
@@ -20872,38 +20880,38 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB85_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -20913,36 +20921,36 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB85_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -20954,37 +20962,37 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB85_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -21110,39 +21118,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -21157,34 +21167,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB86_1
@@ -21198,33 +21208,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB86_1
@@ -21238,33 +21248,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB86_1
@@ -21278,34 +21288,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB86_1
@@ -21435,41 +21445,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -21482,35 +21492,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB87_1
@@ -21521,36 +21531,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB87_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -21560,36 +21570,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB87_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -21599,37 +21609,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB87_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -21754,39 +21764,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -21801,34 +21813,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB88_1
@@ -21842,33 +21854,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB88_1
@@ -21882,33 +21894,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB88_1
@@ -21922,34 +21934,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB88_1
@@ -22079,41 +22091,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -22126,35 +22138,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB89_1
@@ -22165,36 +22177,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB89_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -22204,36 +22216,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB89_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -22243,37 +22255,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB89_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -22398,39 +22410,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -22445,34 +22459,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB90_1
@@ -22486,33 +22500,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB90_1
@@ -22526,33 +22540,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB90_1
@@ -22566,34 +22580,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB90_1
@@ -22723,41 +22737,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -22770,35 +22784,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB91_1
@@ -22809,36 +22823,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB91_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -22848,36 +22862,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB91_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -22887,37 +22901,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB91_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -23156,34 +23170,34 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
;
; GFX8-LABEL: infer_as_before_atomic:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB92_3
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, s2
-; GFX8-NEXT: s_mov_b64 s[2:3], 0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX8-NEXT: s_bcnt1_i32_b64 s5, s[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v4, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: .LBB92_2: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v4, s1
-; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_add_f32_e32 v2, v3, v4
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB92_2
; GFX8-NEXT: .LBB92_3:
; GFX8-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index d70159b0b0ac7e..e2fde562d36b11 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -32,13 +32,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -77,13 +77,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -101,13 +101,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -193,13 +193,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -238,13 +238,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -262,13 +262,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -356,13 +356,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -401,13 +401,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -425,13 +425,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -517,21 +517,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB3_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -561,20 +561,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -584,20 +584,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB3_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -672,21 +672,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB4_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -716,20 +716,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB4_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -739,20 +739,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB4_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -830,21 +830,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -874,20 +874,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -897,20 +897,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -991,13 +991,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1036,13 +1036,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1062,13 +1062,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1155,21 +1155,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB7_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1199,22 +1199,22 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB7_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1224,20 +1224,20 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1317,13 +1317,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1341,14 +1341,15 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1368,14 +1369,14 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX10-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1395,13 +1396,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1419,13 +1420,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1546,13 +1547,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1591,13 +1592,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1615,13 +1616,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1711,13 +1712,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1756,13 +1757,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1780,13 +1781,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1872,13 +1873,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1917,13 +1918,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1941,13 +1942,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2035,13 +2036,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -2080,13 +2081,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -2104,13 +2105,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2196,21 +2197,21 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB13_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2240,20 +2241,20 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2263,20 +2264,20 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2351,21 +2352,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB14_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2395,20 +2396,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2418,20 +2419,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2509,21 +2510,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB15_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2553,20 +2554,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2576,20 +2577,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB15_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2670,13 +2671,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -2715,13 +2716,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2741,13 +2742,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2834,21 +2835,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB17_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2878,22 +2879,22 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2903,20 +2904,20 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2990,15 +2991,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3028,15 +3029,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3074,15 +3075,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -3100,15 +3101,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3163,15 +3164,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3201,15 +3202,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3247,15 +3248,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -3275,15 +3276,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7]
+; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3337,15 +3338,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3375,15 +3376,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3421,15 +3422,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -3449,15 +3450,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7]
+; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3510,21 +3511,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
+; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -3547,22 +3548,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
+; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3592,21 +3593,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v5, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3616,21 +3617,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3673,21 +3674,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040
+; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -3710,22 +3711,22 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040
+; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3755,21 +3756,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040
+; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v5, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3781,21 +3782,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3839,21 +3840,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048
+; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -3876,22 +3877,22 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048
+; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3921,21 +3922,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v5, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB23_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3947,21 +3948,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4006,15 +4007,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4044,15 +4045,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -4072,15 +4073,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX10-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -4101,13 +4102,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -4125,15 +4126,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -4151,15 +4152,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -4177,28 +4178,26 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v11, v1
; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[2:3]
+; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
@@ -4213,14 +4212,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_mov_b32_e32 v7, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4228,14 +4226,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX6-NEXT: v_mov_b32_e32 v11, v1
; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[2:3]
+; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
; GFX6-NEXT: v_mov_b32_e32 v0, v8
; GFX6-NEXT: v_mov_b32_e32 v1, v9
; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
@@ -4259,15 +4256,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4297,15 +4294,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -4343,15 +4340,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -4369,15 +4366,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -4436,8 +4433,9 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
@@ -4449,12 +4447,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -4487,14 +4484,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
@@ -4514,8 +4511,9 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
@@ -4527,12 +4525,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -4556,6 +4553,7 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
@@ -4567,10 +4565,9 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_max_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -4599,14 +4596,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
@@ -4633,14 +4630,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_max_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
@@ -4667,17 +4664,17 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -4790,10 +4787,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -4803,12 +4801,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -4843,14 +4840,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
@@ -4871,10 +4868,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -4884,12 +4882,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -4914,9 +4911,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: global_load_dword v5, v[0:1], off
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -4925,10 +4923,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_max_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -4958,14 +4955,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
@@ -4993,14 +4990,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_max_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
@@ -5028,17 +5025,17 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -5155,10 +5152,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -5168,12 +5166,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -5209,14 +5206,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
@@ -5237,10 +5234,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -5250,12 +5248,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -5280,9 +5277,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: global_load_dword v5, v[0:1], off
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -5291,10 +5289,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_max_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -5324,14 +5321,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
@@ -5359,14 +5356,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_max_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
@@ -5394,17 +5391,17 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -5520,8 +5517,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v4, v[0:1], off
@@ -5533,10 +5531,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
@@ -5570,13 +5567,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v6, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v7
+; GFX940-NEXT: v_max_f16_e32 v4, v4, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX940-NEXT: buffer_wbl2 sc1
@@ -5596,8 +5593,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v4, v[0:1], off
@@ -5609,10 +5607,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_max_f16_e32 v3, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
@@ -5637,6 +5634,7 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
@@ -5648,9 +5646,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX10-NEXT: v_max_f16_e32 v3, v3, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -5679,13 +5676,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
@@ -5712,13 +5709,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
; GFX908-NEXT: v_not_b32_e32 v6, v3
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX908-NEXT: v_max_f16_e32 v3, v3, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
@@ -5745,16 +5742,16 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
; GFX8-NEXT: v_not_b32_e32 v6, v3
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -5862,36 +5859,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -5909,29 +5906,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB30_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5941,37 +5938,37 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5983,31 +5980,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB30_1
@@ -6018,31 +6015,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6052,31 +6049,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB30_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6086,32 +6083,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB30_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6216,36 +6213,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6264,29 +6261,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB31_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6296,37 +6293,37 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6338,31 +6335,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB31_1
@@ -6373,31 +6370,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB31_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6407,31 +6404,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB31_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6441,32 +6438,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB31_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6572,15 +6569,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4
-; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4
+; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
@@ -6605,14 +6602,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f16_e32 v4, v2, v2
; GFX940-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f16_e32 v3, v3, v4
+; GFX940-NEXT: v_max_f16_e32 v3, v3, v2
; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
@@ -6631,15 +6628,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX11-NEXT: v_max_f16_e32 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX11-NEXT: v_max_f16_e32 v3, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
@@ -6662,14 +6659,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX10-NEXT: v_max_f16_e32 v3, v5, v3
+; GFX10-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX10-NEXT: v_max_f16_e32 v3, v3, v2
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -6691,14 +6688,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v3, v3, v4
+; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2
; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6717,14 +6714,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
; GFX908-NEXT: v_max_f16_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v5
+; GFX908-NEXT: v_max_f16_e32 v3, v3, v2
; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -6745,19 +6742,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v1, v2, v2
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v5
-; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_e32 v0, v6, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB32_1
@@ -6847,24 +6844,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4
+; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6878,23 +6875,23 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v4, v2, v2
; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f16_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f16_e32 v4, v5, v5
-; GFX940-NEXT: v_max_f16_e32 v3, v4, v3
-; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB33_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6904,25 +6901,25 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-NEXT: v_max_f16_e32 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f16_e32 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v5, v4, v4
+; GFX11-NEXT: v_max_f16_e32 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v5, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6934,24 +6931,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX10-NEXT: v_max_f16_e32 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_max_f16_e32 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX10-NEXT: v_max_f16_e32 v3, v5, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB33_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6961,22 +6958,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v3, v4, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB33_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6986,22 +6983,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v4, v2, v2
; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f16_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX908-NEXT: v_max_f16_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX908-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB33_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7013,22 +7010,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v4, v2, v2
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_max_f16_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB33_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7117,10 +7114,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -7130,12 +7128,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -7171,14 +7168,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
@@ -7199,10 +7196,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -7212,12 +7210,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -7242,9 +7239,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: global_load_dword v5, v[0:1], off
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -7253,10 +7251,9 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_max_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_max_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -7286,14 +7283,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: buffer_wbl2
@@ -7323,14 +7320,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_max_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
@@ -7358,17 +7355,17 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -7483,37 +7480,37 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7531,29 +7528,29 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB35_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7563,37 +7560,37 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7605,31 +7602,31 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB35_1
@@ -7640,33 +7637,33 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB35_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7676,31 +7673,31 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB35_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7710,32 +7707,32 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB35_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11684,15 +11681,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -11714,14 +11711,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_max_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -11739,15 +11736,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -11767,14 +11764,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -11794,13 +11791,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -11818,13 +11815,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_max_f16 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -11842,21 +11839,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v3, v7, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v5
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB46_1
@@ -11977,15 +11974,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12007,14 +12004,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_max_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -12032,15 +12029,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -12060,14 +12057,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -12087,13 +12084,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -12111,13 +12108,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_max_f16 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -12137,21 +12134,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v0, v7, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB47_1
@@ -12272,15 +12269,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12302,14 +12299,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_max_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -12327,15 +12324,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -12355,14 +12352,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -12382,13 +12379,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -12406,13 +12403,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_max_f16 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -12432,21 +12429,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v0, v7, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB48_1
@@ -12570,21 +12567,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -12598,22 +12595,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB49_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12623,22 +12620,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12650,21 +12647,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB49_1
@@ -12675,20 +12672,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12698,20 +12695,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB49_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12721,24 +12718,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB49_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12852,21 +12849,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -12880,22 +12877,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB50_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12905,22 +12902,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12932,21 +12929,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB50_1
@@ -12957,20 +12954,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12980,20 +12977,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB50_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13005,24 +13002,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB50_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13137,21 +13134,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -13165,22 +13162,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB51_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13190,22 +13187,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13217,21 +13214,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB51_1
@@ -13242,20 +13239,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB51_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13265,20 +13262,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB51_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13290,24 +13287,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB51_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13431,15 +13428,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@@ -13462,14 +13459,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_max_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -13487,15 +13484,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -13515,14 +13512,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -13542,13 +13539,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -13568,13 +13565,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_max_f16 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -13594,21 +13591,21 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v0, v7, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB52_1
@@ -13728,22 +13725,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -13757,22 +13754,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB53_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13782,22 +13779,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13809,21 +13806,21 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB53_1
@@ -13834,22 +13831,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13859,20 +13856,20 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB53_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13884,24 +13881,24 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB53_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14021,36 +14018,38 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14067,35 +14066,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB54_1
@@ -14108,39 +14107,41 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14155,34 +14156,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB54_1
@@ -14196,33 +14197,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
@@ -14236,33 +14237,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB54_1
@@ -14276,34 +14277,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB54_1
@@ -14416,36 +14417,38 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14462,35 +14465,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB55_1
@@ -14503,39 +14506,41 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14550,34 +14555,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB55_1
@@ -14591,33 +14596,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
@@ -14631,33 +14636,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB55_1
@@ -14673,34 +14678,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_max_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB55_1
@@ -14813,36 +14818,38 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14859,35 +14866,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB56_1
@@ -14900,39 +14907,41 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14947,34 +14956,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB56_1
@@ -14988,33 +14997,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
@@ -15028,33 +15037,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB56_1
@@ -15070,34 +15079,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_max_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB56_1
@@ -15213,38 +15222,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15258,38 +15267,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB57_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15299,41 +15308,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15346,35 +15355,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB57_1
@@ -15385,36 +15394,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB57_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15424,36 +15433,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB57_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15463,37 +15472,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB57_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15599,38 +15608,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15644,38 +15653,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB58_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15685,41 +15694,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15732,35 +15741,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB58_1
@@ -15771,36 +15780,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15810,36 +15819,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB58_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15851,37 +15860,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB58_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15988,38 +15997,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16033,38 +16042,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB59_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16074,41 +16083,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16121,35 +16130,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB59_1
@@ -16160,36 +16169,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB59_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16199,36 +16208,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB59_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16240,37 +16249,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB59_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16386,37 +16395,39 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16433,35 +16444,35 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB60_1
@@ -16474,39 +16485,41 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16521,34 +16534,34 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB60_1
@@ -16562,35 +16575,35 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB60_1
@@ -16604,33 +16617,33 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_max_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB60_1
@@ -16646,34 +16659,34 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_max_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_max_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB60_1
@@ -16785,39 +16798,39 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16831,38 +16844,38 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB61_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16872,41 +16885,41 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16919,35 +16932,35 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB61_1
@@ -16958,38 +16971,38 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB61_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16999,36 +17012,36 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB61_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17040,37 +17053,37 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB61_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index ecea5fdecfcd0e..903e80b15814fd 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -32,13 +32,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -77,13 +77,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -101,13 +101,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -193,13 +193,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -238,13 +238,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -262,13 +262,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -356,13 +356,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -401,13 +401,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -425,13 +425,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -517,21 +517,21 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB3_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -561,20 +561,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -584,20 +584,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB3_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -672,21 +672,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB4_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -716,20 +716,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB4_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -739,20 +739,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB4_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -830,21 +830,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -874,20 +874,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -897,20 +897,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g
; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -991,13 +991,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1036,13 +1036,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1062,13 +1062,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1155,21 +1155,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB7_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1199,22 +1199,22 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB7_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1224,20 +1224,20 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1317,13 +1317,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1341,14 +1341,15 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1368,14 +1369,14 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v2, v2
-; GFX10-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1395,13 +1396,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1419,13 +1420,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1546,13 +1547,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1591,13 +1592,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1615,13 +1616,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1711,13 +1712,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1756,13 +1757,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1780,13 +1781,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1872,13 +1873,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1917,13 +1918,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -1941,13 +1942,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2035,13 +2036,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -2080,13 +2081,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -2104,13 +2105,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2196,21 +2197,21 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB13_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2240,20 +2241,20 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2263,20 +2264,20 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2351,21 +2352,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB14_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2395,20 +2396,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2418,20 +2419,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2509,21 +2510,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB15_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2553,20 +2554,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2576,20 +2577,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f
; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB15_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2670,13 +2671,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -2715,13 +2716,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2741,13 +2742,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2834,21 +2835,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB17_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2878,22 +2879,22 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2903,20 +2904,20 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2990,15 +2991,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3028,15 +3029,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3074,15 +3075,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -3100,15 +3101,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
+; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3163,15 +3164,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3201,15 +3202,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3247,15 +3248,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -3275,15 +3276,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7]
+; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3337,15 +3338,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3375,15 +3376,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -3421,15 +3422,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -3449,15 +3450,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7]
+; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3510,21 +3511,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
+; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -3547,22 +3548,22 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
+; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3592,21 +3593,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v5, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3616,21 +3617,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3673,21 +3674,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040
+; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -3710,22 +3711,22 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040
+; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3755,21 +3756,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040
+; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v5, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3781,21 +3782,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3839,21 +3840,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048
+; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048
+; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -3876,22 +3877,22 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048
+; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048
+; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3921,21 +3922,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
+; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v5, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB23_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3947,21 +3948,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4006,15 +4007,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4044,15 +4045,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -4072,15 +4073,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX10-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -4101,13 +4102,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
+; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -4125,15 +4126,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -4151,15 +4152,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
+; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -4177,28 +4178,26 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v11, v1
; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[2:3]
+; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
@@ -4213,14 +4212,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_mov_b32_e32 v7, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4228,14 +4226,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX6-NEXT: v_mov_b32_e32 v11, v1
; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[2:3]
+; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
; GFX6-NEXT: v_mov_b32_e32 v0, v8
; GFX6-NEXT: v_mov_b32_e32 v1, v9
; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
@@ -4259,15 +4256,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4297,15 +4294,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
-; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -4343,15 +4340,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -4369,15 +4366,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9]
+; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -4436,8 +4433,9 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
@@ -4449,12 +4447,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -4487,14 +4484,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_min_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
@@ -4514,8 +4511,9 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
@@ -4527,12 +4525,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -4556,6 +4553,7 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
@@ -4567,10 +4565,9 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_min_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -4599,14 +4596,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
@@ -4633,14 +4630,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_min_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_min_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
@@ -4667,17 +4664,17 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -4790,10 +4787,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -4803,12 +4801,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -4843,14 +4840,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_min_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
@@ -4871,10 +4868,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -4884,12 +4882,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -4914,9 +4911,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: global_load_dword v5, v[0:1], off
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -4925,10 +4923,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_min_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -4958,14 +4955,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
@@ -4993,14 +4990,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_min_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_min_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
@@ -5028,17 +5025,17 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -5155,10 +5152,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -5168,12 +5166,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -5209,14 +5206,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_min_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
@@ -5237,10 +5234,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -5250,12 +5248,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -5280,9 +5277,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: global_load_dword v5, v[0:1], off
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -5291,10 +5289,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_min_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -5324,14 +5321,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
@@ -5359,14 +5356,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_min_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_min_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
@@ -5394,17 +5391,17 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -5520,8 +5517,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v4, v[0:1], off
@@ -5533,10 +5531,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
@@ -5570,13 +5567,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v6, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_min_f16_e32 v4, v4, v7
+; GFX940-NEXT: v_min_f16_e32 v4, v4, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX940-NEXT: buffer_wbl2 sc1
@@ -5596,8 +5593,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v4, v[0:1], off
@@ -5609,10 +5607,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_min_f16_e32 v3, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
@@ -5637,6 +5634,7 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
@@ -5648,9 +5646,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX10-NEXT: v_min_f16_e32 v3, v3, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -5679,13 +5676,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7
+; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
@@ -5712,13 +5709,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
; GFX908-NEXT: v_not_b32_e32 v6, v3
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX908-NEXT: v_min_f16_e32 v3, v3, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
@@ -5745,16 +5742,16 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
; GFX8-NEXT: v_not_b32_e32 v6, v3
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
+; GFX8-NEXT: v_min_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -5862,36 +5859,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -5909,29 +5906,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_min_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB30_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5941,37 +5938,37 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5983,31 +5980,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB30_1
@@ -6018,31 +6015,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6052,31 +6049,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB30_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6086,32 +6083,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB30_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6216,36 +6213,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6264,29 +6261,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_min_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB31_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6296,37 +6293,37 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6338,31 +6335,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB31_1
@@ -6373,31 +6370,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB31_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6407,31 +6404,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB31_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6441,32 +6438,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB31_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6572,15 +6569,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4
-; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4
+; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
@@ -6605,14 +6602,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f16_e32 v4, v2, v2
; GFX940-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX940-NEXT: v_min_f16_e32 v3, v3, v4
+; GFX940-NEXT: v_min_f16_e32 v3, v3, v2
; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
@@ -6631,15 +6628,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX11-NEXT: v_min_f16_e32 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX11-NEXT: v_min_f16_e32 v3, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
@@ -6662,14 +6659,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX10-NEXT: v_min_f16_e32 v3, v5, v3
+; GFX10-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX10-NEXT: v_min_f16_e32 v3, v3, v2
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -6691,14 +6688,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v3, v3, v4
+; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2
; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6717,14 +6714,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
; GFX908-NEXT: v_max_f16_e32 v3, v4, v4
-; GFX908-NEXT: v_min_f16_e32 v3, v3, v5
+; GFX908-NEXT: v_min_f16_e32 v3, v3, v2
; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -6745,19 +6742,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v1, v2, v2
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX8-NEXT: v_min_f16_e32 v0, v0, v5
-; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_e32 v0, v6, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB32_1
@@ -6847,24 +6844,24 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4
+; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6878,23 +6875,23 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v4, v2, v2
; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_max_f16_e32 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f16_e32 v4, v5, v5
-; GFX940-NEXT: v_min_f16_e32 v3, v4, v3
-; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX940-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB33_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6904,25 +6901,25 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-NEXT: v_max_f16_e32 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_max_f16_e32 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v5, v4, v4
+; GFX11-NEXT: v_max_f16_e32 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v3, v5, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6934,24 +6931,24 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX10-NEXT: v_max_f16_e32 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_max_f16_e32 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX10-NEXT: v_min_f16_e32 v3, v5, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX10-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB33_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6961,22 +6958,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v3, v4, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB33_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6986,22 +6983,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v4, v2, v2
; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_max_f16_e32 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX908-NEXT: v_min_f16_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX908-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX908-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB33_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7013,22 +7010,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v4, v2, v2
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_min_f16_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB33_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7117,10 +7114,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -7130,12 +7128,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7
+; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -7171,14 +7168,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX940-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX940-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX940-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX940-NEXT: v_min_f16_e32 v5, v5, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
@@ -7199,10 +7196,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -7212,12 +7210,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
@@ -7242,9 +7239,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: global_load_dword v5, v[0:1], off
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
@@ -7253,10 +7251,9 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX10-NEXT: v_min_f16_e32 v5, v5, v7
+; GFX10-NEXT: v_min_f16_e32 v5, v5, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -7286,14 +7283,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6
-; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT: buffer_wbl2
@@ -7323,14 +7320,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX908-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX908-NEXT: v_min_f16_e32 v5, v7, v5
+; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX908-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX908-NEXT: v_min_f16_e32 v5, v5, v2
; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
@@ -7358,17 +7355,17 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6
-; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v5
-; GFX8-NEXT: v_and_b32_e32 v8, v6, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -7483,37 +7480,37 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7531,29 +7528,29 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_max_f16_e32 v6, v2, v2
; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX940-NEXT: v_min_f16_e32 v4, v4, v7
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX940-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX940-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB35_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7563,37 +7560,37 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7605,31 +7602,31 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX10-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB35_1
@@ -7640,33 +7637,33 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB35_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7676,31 +7673,31 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_max_f16_e32 v6, v2, v2
; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX908-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX908-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB35_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7710,32 +7707,32 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_max_f16_e32 v7, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_min_f16_e32 v3, v3, v7
-; GFX8-NEXT: v_and_b32_e32 v8, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v8, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB35_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11684,15 +11681,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -11714,14 +11711,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_min_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -11739,15 +11736,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_min_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -11767,14 +11764,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX10-NEXT: v_pk_min_f16 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -11794,13 +11791,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -11818,13 +11815,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_min_f16 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -11842,21 +11839,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v3, v7, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v5
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB46_1
@@ -11977,15 +11974,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12007,14 +12004,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_min_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -12032,15 +12029,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_min_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -12060,14 +12057,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX10-NEXT: v_pk_min_f16 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -12087,13 +12084,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -12111,13 +12108,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_min_f16 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -12137,21 +12134,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v1, v1
-; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v0, v7, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB47_1
@@ -12272,15 +12269,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12302,14 +12299,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_min_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -12327,15 +12324,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_min_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -12355,14 +12352,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX10-NEXT: v_pk_min_f16 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -12382,13 +12379,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -12406,13 +12403,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_min_f16 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -12432,21 +12429,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v1, v1
-; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v0, v7, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB48_1
@@ -12570,21 +12567,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -12598,22 +12595,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_min_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB49_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12623,22 +12620,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_min_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12650,21 +12647,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_min_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB49_1
@@ -12675,20 +12672,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12698,20 +12695,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB49_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12721,24 +12718,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB49_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12852,21 +12849,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -12880,22 +12877,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_min_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB50_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12905,22 +12902,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_min_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12932,21 +12929,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_min_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB50_1
@@ -12957,20 +12954,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12980,20 +12977,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB50_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13005,24 +13002,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB50_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13137,21 +13134,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -13165,22 +13162,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_min_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB51_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13190,22 +13187,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_min_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13217,21 +13214,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_min_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB51_1
@@ -13242,20 +13239,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB51_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13265,20 +13262,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB51_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13290,24 +13287,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB51_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13431,15 +13428,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@@ -13462,14 +13459,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v2
; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: v_pk_max_f16 v3, v5, v5
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX940-NEXT: v_pk_min_f16 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -13487,15 +13484,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX11-NEXT: v_pk_min_f16 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -13515,14 +13512,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX10-NEXT: v_pk_min_f16 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -13542,13 +13539,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4
+; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -13568,13 +13565,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_max_f16 v5, v2, v2
; GFX908-NEXT: v_pk_max_f16 v3, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v3, v5
+; GFX908-NEXT: v_pk_min_f16 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -13594,21 +13591,21 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v1, v1
-; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v0, v7, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
+; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB52_1
@@ -13728,22 +13725,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3
+; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -13757,22 +13754,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v5, v5
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v4, v4, v3
+; GFX940-NEXT: v_pk_min_f16 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB53_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13782,22 +13779,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX11-NEXT: v_pk_min_f16 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13809,21 +13806,21 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX10-NEXT: v_pk_min_f16 v3, v5, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_min_f16 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB53_1
@@ -13834,22 +13831,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13859,20 +13856,20 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v3, v5, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB53_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13884,24 +13881,24 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v2, v2
; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v7, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v5, v7, v6
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v3, v3
+; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB53_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14021,36 +14018,38 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14067,35 +14066,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB54_1
@@ -14108,39 +14107,41 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14155,34 +14156,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB54_1
@@ -14196,33 +14197,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
@@ -14236,33 +14237,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB54_1
@@ -14276,34 +14277,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB54_1
@@ -14416,36 +14417,38 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14462,35 +14465,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB55_1
@@ -14503,39 +14506,41 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14550,34 +14555,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB55_1
@@ -14591,33 +14596,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
@@ -14631,33 +14636,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB55_1
@@ -14673,34 +14678,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_min_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB55_1
@@ -14813,36 +14818,38 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14859,35 +14866,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB56_1
@@ -14900,39 +14907,41 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14947,34 +14956,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB56_1
@@ -14988,33 +14997,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
@@ -15028,33 +15037,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB56_1
@@ -15070,34 +15079,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_min_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB56_1
@@ -15213,38 +15222,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15258,38 +15267,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB57_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15299,41 +15308,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15346,35 +15355,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB57_1
@@ -15385,36 +15394,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB57_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15424,36 +15433,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB57_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15463,37 +15472,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB57_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15599,38 +15608,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15644,38 +15653,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB58_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15685,41 +15694,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15732,35 +15741,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB58_1
@@ -15771,36 +15780,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15810,36 +15819,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB58_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15851,37 +15860,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB58_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15988,38 +15997,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16033,38 +16042,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB59_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16074,41 +16083,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16121,35 +16130,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB59_1
@@ -16160,36 +16169,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB59_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16199,36 +16208,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB59_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16240,37 +16249,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB59_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16386,37 +16395,39 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16433,35 +16444,35 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB60_1
@@ -16474,39 +16485,41 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16521,34 +16534,34 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB60_1
@@ -16562,35 +16575,35 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB60_1
@@ -16604,33 +16617,33 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_min_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB60_1
@@ -16646,34 +16659,34 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_min_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_min_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB60_1
@@ -16785,39 +16798,39 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16831,38 +16844,38 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB61_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16872,41 +16885,41 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16919,35 +16932,35 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB61_1
@@ -16958,38 +16971,38 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB61_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16999,36 +17012,36 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB61_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17040,37 +17053,37 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB61_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 88a7a5f4d9c6bf..3dbf6477a7cb89 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -14477,36 +14477,38 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14523,35 +14525,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB50_1
@@ -14564,39 +14566,41 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -14611,34 +14615,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB50_1
@@ -14652,33 +14656,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
@@ -14692,33 +14696,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB50_1
@@ -14732,34 +14736,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB50_1
@@ -14872,36 +14876,38 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -14918,35 +14924,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB51_1
@@ -14959,39 +14965,41 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15006,34 +15014,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB51_1
@@ -15047,33 +15055,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB51_1
@@ -15087,33 +15095,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB51_1
@@ -15129,34 +15137,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB51_1
@@ -15269,36 +15277,38 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15315,35 +15325,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB52_1
@@ -15356,39 +15366,41 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15403,34 +15415,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB52_1
@@ -15444,33 +15456,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
@@ -15484,33 +15496,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB52_1
@@ -15526,34 +15538,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB52_1
@@ -15669,38 +15681,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -15714,38 +15726,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB53_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15755,41 +15767,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -15802,35 +15814,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB53_1
@@ -15841,36 +15853,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15880,36 +15892,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB53_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15919,37 +15931,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX8-LABEL: global_agent_atomic_fsub_noret_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB53_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16055,38 +16067,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16100,38 +16112,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB54_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16141,41 +16153,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16188,35 +16200,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB54_1
@@ -16227,36 +16239,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16266,36 +16278,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB54_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16307,37 +16319,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB54_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16444,38 +16456,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16489,38 +16501,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB55_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16530,41 +16542,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16577,35 +16589,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB55_1
@@ -16616,36 +16628,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16655,36 +16667,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB55_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16696,37 +16708,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB55_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16842,37 +16854,39 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -16889,35 +16903,35 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB56_1
@@ -16930,39 +16944,41 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -16977,34 +16993,34 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB56_1
@@ -17018,35 +17034,35 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
@@ -17060,33 +17076,33 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3
-; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB56_1
@@ -17102,34 +17118,34 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5
-; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB56_1
@@ -17241,39 +17257,39 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -17287,38 +17303,38 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX940-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB57_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17328,41 +17344,41 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX11-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -17375,35 +17391,35 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX10-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB57_1
@@ -17414,38 +17430,38 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX90A-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB57_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17455,36 +17471,36 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX908-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB57_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17496,37 +17512,37 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB57_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
index dde6671178af71..b9592a9ff9073a 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
@@ -56,19 +56,19 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_movk_i32 s2, 0x100
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_ushort v0, v1, s[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: s_movk_i32 s1, 0x100
; GCN-NEXT: .LBB1_1: ; %bb3
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1]
-; GCN-NEXT: v_mov_b32_e32 v0, s1
-; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off glc
+; GCN-NEXT: v_lshlrev_b64 v[3:4], 1, v[0:1]
+; GCN-NEXT: v_add_co_u32_e32 v3, vcc, s0, v3
+; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
+; GCN-NEXT: global_load_short_d16_hi v0, v[3:4], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s1, v0
; GCN-NEXT: s_cbranch_vccz .LBB1_1
; GCN-NEXT: ; %bb.2: ; %bb2
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index 10d6a54baf665b..f7882e6f120222 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -2528,21 +2528,19 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: flat_load_dword v1, v[0:1]
+; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB55_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s6, v1
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_not_b32_e32 v0, v0
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_and_b32_e32 v2, s6, v3
+; VI-NEXT: v_not_b32_e32 v2, v2
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_mov_b32_e32 v3, v2
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB55_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2623,25 +2621,23 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s34
; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dword v1, v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB56_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s6, v1
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: v_not_b32_e32 v0, v0
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_and_b32_e32 v2, s6, v3
+; VI-NEXT: v_not_b32_e32 v2, v2
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB56_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_nand_i32_noret_offset_scalar:
@@ -2718,19 +2714,19 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: .LBB57_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_and_b32_e32 v0, s6, v1
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_not_b32_e32 v0, v0
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_and_b32_e32 v0, s6, v4
+; VI-NEXT: v_not_b32_e32 v3, v0
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB57_1
@@ -2810,27 +2806,25 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 16
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB58_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_and_b32_e32 v0, s6, v1
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: v_not_b32_e32 v0, v0
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_and_b32_e32 v0, s6, v4
+; VI-NEXT: v_not_b32_e32 v3, v0
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB58_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_nand_i32_ret_offset_scalar:
@@ -4304,20 +4298,18 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: flat_load_dword v1, v[0:1]
+; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB87_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_i32_e32 v0, s6, v1
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_max_i32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_mov_b32_e32 v3, v2
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB87_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4396,24 +4388,22 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s34
; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dword v1, v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB88_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_i32_e32 v0, s6, v1
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_max_i32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB88_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_max_i32_noret_offset_scalar:
@@ -4488,18 +4478,18 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: .LBB89_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_max_i32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_max_i32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB89_1
@@ -4577,26 +4567,24 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 16
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB90_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: v_max_i32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_max_i32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB90_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_max_i32_ret_offset_scalar:
@@ -4665,26 +4653,26 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
; VI-NEXT: s_ashr_i32 s5, s3, 31
; VI-NEXT: s_mov_b32 s4, s3
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: s_load_dword s3, s[0:1], 0x10
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_add_u32 s4, s0, s4
+; VI-NEXT: s_addc_u32 s5, s1, s5
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB91_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_max_i32_e32 v0, s2, v1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_max_i32_e32 v2, s2, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB91_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -4771,32 +4759,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_ashr_i32 s5, s7, 31
; VI-NEXT: s_mov_b32 s4, s7
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: s_load_dword s7, s[0:1], 0x10
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_add_u32 s4, s0, s4
+; VI-NEXT: s_addc_u32 s5, s1, s5
+; VI-NEXT: s_load_dword s7, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB92_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_max_i32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: v_max_i32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB92_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: flat_store_dword v[1:2], v0
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i32_ret_addr64_offset:
@@ -4878,24 +4866,24 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
; VI-NEXT: s_ashr_i32 s5, s3, 31
; VI-NEXT: s_mov_b32 s4, s3
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: s_load_dword s3, s[0:1], 0x0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_add_u32 s4, s0, s4
+; VI-NEXT: s_addc_u32 s5, s1, s5
+; VI-NEXT: s_load_dword s3, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: .LBB93_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_max_i32_e32 v0, s2, v1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_max_i32_e32 v2, s2, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB93_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -4981,30 +4969,30 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_ashr_i32 s5, s7, 31
; VI-NEXT: s_mov_b32 s4, s7
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: s_load_dword s7, s[0:1], 0x0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_add_u32 s4, s0, s4
+; VI-NEXT: s_addc_u32 s5, s1, s5
+; VI-NEXT: s_load_dword s7, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: .LBB94_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_max_i32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: v_max_i32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB94_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: flat_store_dword v[1:2], v0
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i32_ret_addr64:
@@ -5563,20 +5551,18 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: flat_load_dword v1, v[0:1]
+; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB101_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_u32_e32 v0, s6, v1
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_max_u32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_mov_b32_e32 v3, v2
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB101_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5655,24 +5641,22 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s34
; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dword v1, v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB102_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_u32_e32 v0, s6, v1
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_max_u32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB102_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umax_i32_noret_offset_scalar:
@@ -5747,18 +5731,18 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: .LBB103_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_max_u32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_max_u32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB103_1
@@ -5836,26 +5820,24 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 16
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB104_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: v_max_u32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_max_u32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB104_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umax_i32_ret_offset_scalar:
@@ -5924,26 +5906,26 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
; VI-NEXT: s_ashr_i32 s5, s3, 31
; VI-NEXT: s_mov_b32 s4, s3
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: s_load_dword s3, s[0:1], 0x10
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_add_u32 s4, s0, s4
+; VI-NEXT: s_addc_u32 s5, s1, s5
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB105_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_max_u32_e32 v0, s2, v1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_max_u32_e32 v2, s2, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB105_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -6030,32 +6012,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
; VI-NEXT: s_ashr_i32 s5, s7, 31
; VI-NEXT: s_mov_b32 s4, s7
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: s_load_dword s7, s[0:1], 0x10
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_add_u32 s4, s0, s4
+; VI-NEXT: s_addc_u32 s5, s1, s5
+; VI-NEXT: s_load_dword s7, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB106_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_max_u32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: v_max_u32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB106_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: flat_store_dword v[1:2], v0
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset:
@@ -6145,30 +6127,30 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_ashr_i32 s5, s7, 31
; VI-NEXT: s_mov_b32 s4, s7
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: s_load_dword s7, s[0:1], 0x0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_add_u32 s4, s0, s4
+; VI-NEXT: s_addc_u32 s5, s1, s5
+; VI-NEXT: s_load_dword s7, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: .LBB107_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_max_u32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: v_max_u32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB107_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: flat_store_dword v[1:2], v0
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i32_ret_addr64:
@@ -6727,20 +6709,18 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: flat_load_dword v1, v[0:1]
+; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB114_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_min_u32_e32 v0, s6, v1
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_min_u32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_mov_b32_e32 v3, v2
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB114_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6819,24 +6799,22 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s34
; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dword v1, v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB115_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_min_u32_e32 v0, s6, v1
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_min_u32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB115_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umin_i32_noret_offset_scalar:
@@ -6911,18 +6889,18 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: .LBB116_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_min_u32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_min_u32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB116_1
@@ -7000,26 +6978,24 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 16
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB117_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: v_min_u32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_min_u32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB117_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar:
@@ -7566,20 +7542,18 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: flat_load_dword v1, v[0:1]
+; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB124_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_min_i32_e32 v0, s6, v1
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_min_i32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_mov_b32_e32 v3, v2
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB124_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7658,24 +7632,22 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s34
; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dword v1, v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB125_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_min_i32_e32 v0, s6, v1
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_min_i32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB125_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_min_i32_noret_offset_scalar:
@@ -7750,18 +7722,18 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: .LBB126_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_min_i32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_min_i32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB126_1
@@ -7839,26 +7811,24 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 16
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB127_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: v_min_i32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_min_i32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB127_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_min_i32_ret_offset_scalar:
@@ -7927,26 +7897,26 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
; VI-NEXT: s_ashr_i32 s5, s3, 31
; VI-NEXT: s_mov_b32 s4, s3
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: s_load_dword s3, s[0:1], 0x10
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_add_u32 s4, s0, s4
+; VI-NEXT: s_addc_u32 s5, s1, s5
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB128_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_min_i32_e32 v0, s2, v1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_min_i32_e32 v2, s2, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB128_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -8033,32 +8003,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_ashr_i32 s5, s7, 31
; VI-NEXT: s_mov_b32 s4, s7
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: s_load_dword s7, s[0:1], 0x10
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_add_u32 s4, s0, s4
+; VI-NEXT: s_addc_u32 s5, s1, s5
+; VI-NEXT: s_load_dword s7, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB129_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_min_i32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: v_min_i32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB129_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: flat_store_dword v[1:2], v0
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i32_ret_addr64_offset:
@@ -8131,25 +8101,25 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_min_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT: s_load_dword s4, s[4:5], 0x2c
-; VI-NEXT: s_mov_b64 s[2:3], 0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s5, s[0:1], 0x0
+; VI-NEXT: s_load_dword s3, s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: .LBB130_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_min_i32_e32 v0, s4, v1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_min_i32_e32 v2, s2, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB130_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -8230,30 +8200,30 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_ashr_i32 s5, s7, 31
; VI-NEXT: s_mov_b32 s4, s7
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: s_load_dword s7, s[0:1], 0x0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_add_u32 s4, s0, s4
+; VI-NEXT: s_addc_u32 s5, s1, s5
+; VI-NEXT: s_load_dword s7, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: .LBB131_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_min_i32_e32 v0, s6, v1
-; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: v_min_i32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB131_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: flat_store_dword v[1:2], v0
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i32_ret_addr64:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index 645bb0b117ccb0..59a99a6a0328d4 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -2626,14 +2626,14 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: .LBB54_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, s7, v3
; VI-NEXT: v_and_b32_e32 v6, s6, v2
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_not_b32_e32 v1, v0
; VI-NEXT: v_not_b32_e32 v0, v6
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -2730,17 +2730,15 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB55_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, s7, v3
; VI-NEXT: v_and_b32_e32 v6, s6, v2
-; VI-NEXT: v_mov_b32_e32 v4, s34
-; VI-NEXT: v_mov_b32_e32 v5, s35
; VI-NEXT: v_not_b32_e32 v1, v0
; VI-NEXT: v_not_b32_e32 v0, v6
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -2748,12 +2746,12 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB55_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_nand_i64_noret_offset_scalar:
@@ -2839,22 +2837,22 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: .LBB56_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_and_b32_e32 v0, s7, v3
-; VI-NEXT: v_and_b32_e32 v6, s6, v2
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_not_b32_e32 v1, v0
-; VI-NEXT: v_not_b32_e32 v0, v6
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v7, v1
+; VI-NEXT: v_mov_b32_e32 v6, v0
+; VI-NEXT: v_and_b32_e32 v0, s7, v7
+; VI-NEXT: v_and_b32_e32 v1, s6, v6
+; VI-NEXT: v_not_b32_e32 v5, v0
+; VI-NEXT: v_not_b32_e32 v4, v1
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB56_1
@@ -2943,30 +2941,28 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v2, s34
+; VI-NEXT: v_mov_b32_e32 v3, s35
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: .LBB57_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_mov_b32_e32 v4, s34
-; VI-NEXT: v_and_b32_e32 v0, s7, v3
-; VI-NEXT: v_and_b32_e32 v6, s6, v2
-; VI-NEXT: v_mov_b32_e32 v5, s35
-; VI-NEXT: v_not_b32_e32 v1, v0
-; VI-NEXT: v_not_b32_e32 v0, v6
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v7, v1
+; VI-NEXT: v_mov_b32_e32 v6, v0
+; VI-NEXT: v_and_b32_e32 v0, s7, v7
+; VI-NEXT: v_and_b32_e32 v1, s6, v6
+; VI-NEXT: v_not_b32_e32 v5, v0
+; VI-NEXT: v_not_b32_e32 v4, v1
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB57_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_nand_i64_ret_offset_scalar:
@@ -4438,45 +4434,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB84_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s35
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB84_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -4487,17 +4483,17 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: .LBB84_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -4517,14 +4513,14 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -4546,45 +4542,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB85_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s35
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB85_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -4594,31 +4590,29 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
; VI-NEXT: .LBB85_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s34
-; VI-NEXT: v_mov_b32_e32 v5, s35
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB85_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_max_i64_noret_offset_scalar:
@@ -4627,14 +4621,14 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -4657,45 +4651,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v6, s6, 0
-; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB86_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB86_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v6, 1
-; SI-NEXT: v_readlane_b32 s6, v6, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -4706,23 +4700,23 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_mov_b32_e32 v5, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: .LBB86_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB86_1
@@ -4736,20 +4730,20 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB86_1
@@ -4765,45 +4759,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v6, s6, 0
-; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB87_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB87_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v6, 1
-; SI-NEXT: v_readlane_b32 s6, v6, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -4813,31 +4807,29 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v2, s34
+; VI-NEXT: v_mov_b32_e32 v3, s35
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_mov_b32_e32 v5, s6
; VI-NEXT: .LBB87_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s34
-; VI-NEXT: v_mov_b32_e32 v5, s35
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB87_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_max_i64_ret_offset_scalar:
@@ -4846,20 +4838,20 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB87_1
@@ -4883,29 +4875,29 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: v_mov_b32_e32 v4, s3
+; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s8
; SI-NEXT: v_mov_b32_e32 v3, s9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB88_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB88_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4915,26 +4907,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; VI-NEXT: s_add_u32 s0, s0, s6
+; VI-NEXT: s_addc_u32 s1, s1, s7
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: .LBB88_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -4951,24 +4943,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -4997,17 +4989,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: v_mov_b32_e32 v8, s5
+; SI-NEXT: v_mov_b32_e32 v9, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: .LBB89_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v7, v3
; SI-NEXT: v_mov_b32_e32 v6, v2
@@ -5034,68 +5026,68 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
; VI-LABEL: atomic_max_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_mov_b64 s[8:9], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; VI-NEXT: s_add_u32 s0, s0, s6
; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: .LBB89_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: v_mov_b32_e32 v6, s4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v3
+; VI-NEXT: v_mov_b32_e32 v8, v2
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; VI-NEXT: s_cbranch_execnz .LBB89_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[6:7]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_or_b64 exec, exec, s[8:9]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mov_b32_e32 v3, s12
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB89_1
@@ -5124,29 +5116,29 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: v_mov_b32_e32 v4, s3
+; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s8
; SI-NEXT: v_mov_b32_e32 v3, s9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB90_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB90_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5158,30 +5150,30 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_add_u32 s4, s0, s4
+; VI-NEXT: s_addc_u32 s5, s1, s5
+; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: .LBB90_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB90_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -5190,24 +5182,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -5235,17 +5227,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: v_mov_b32_e32 v8, s5
+; SI-NEXT: v_mov_b32_e32 v9, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: .LBB91_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v7, v3
; SI-NEXT: v_mov_b32_e32 v6, v2
@@ -5274,64 +5266,64 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: s_add_u32 s6, s0, s6
+; VI-NEXT: s_addc_u32 s7, s1, s7
+; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: v_mov_b32_e32 v5, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: .LBB91_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: v_mov_b32_e32 v6, s4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v3
+; VI-NEXT: v_mov_b32_e32 v8, v2
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB91_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[6:7]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mov_b32_e32 v3, s12
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB91_1
@@ -5904,45 +5896,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB98_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s35
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB98_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -5953,17 +5945,17 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: .LBB98_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -5983,14 +5975,14 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6012,45 +6004,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB99_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s35
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB99_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6060,31 +6052,29 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
; VI-NEXT: .LBB99_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s34
-; VI-NEXT: v_mov_b32_e32 v5, s35
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB99_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umax_i64_noret_offset_scalar:
@@ -6093,14 +6083,14 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6123,45 +6113,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v6, s6, 0
-; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB100_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB100_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v6, 1
-; SI-NEXT: v_readlane_b32 s6, v6, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6172,23 +6162,23 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_mov_b32_e32 v5, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: .LBB100_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB100_1
@@ -6202,20 +6192,20 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB100_1
@@ -6231,45 +6221,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v6, s6, 0
-; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB101_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB101_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v6, 1
-; SI-NEXT: v_readlane_b32 s6, v6, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6279,31 +6269,29 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v2, s34
+; VI-NEXT: v_mov_b32_e32 v3, s35
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_mov_b32_e32 v5, s6
; VI-NEXT: .LBB101_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s34
-; VI-NEXT: v_mov_b32_e32 v5, s35
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB101_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umax_i64_ret_offset_scalar:
@@ -6312,20 +6300,20 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB101_1
@@ -6349,29 +6337,29 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: v_mov_b32_e32 v4, s3
+; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s8
; SI-NEXT: v_mov_b32_e32 v3, s9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB102_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB102_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6381,26 +6369,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; VI-NEXT: s_add_u32 s0, s0, s6
+; VI-NEXT: s_addc_u32 s1, s1, s7
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: .LBB102_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -6417,24 +6405,24 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -6463,17 +6451,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: v_mov_b32_e32 v8, s5
+; SI-NEXT: v_mov_b32_e32 v9, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: .LBB103_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v7, v3
; SI-NEXT: v_mov_b32_e32 v6, v2
@@ -6500,68 +6488,68 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
; VI-LABEL: atomic_umax_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_mov_b64 s[8:9], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; VI-NEXT: s_add_u32 s0, s0, s6
; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: .LBB103_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: v_mov_b32_e32 v6, s4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v3
+; VI-NEXT: v_mov_b32_e32 v8, v2
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; VI-NEXT: s_cbranch_execnz .LBB103_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[6:7]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_or_b64 exec, exec, s[8:9]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mov_b32_e32 v3, s12
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB103_1
@@ -6589,17 +6577,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: v_mov_b32_e32 v8, s5
+; SI-NEXT: v_mov_b32_e32 v9, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: .LBB104_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v7, v3
; SI-NEXT: v_mov_b32_e32 v6, v2
@@ -6628,64 +6616,64 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: s_add_u32 s6, s0, s6
+; VI-NEXT: s_addc_u32 s7, s1, s7
+; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: v_mov_b32_e32 v5, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: .LBB104_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: v_mov_b32_e32 v6, s4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v3
+; VI-NEXT: v_mov_b32_e32 v8, v2
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB104_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[6:7]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mov_b32_e32 v3, s12
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB104_1
@@ -7258,45 +7246,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB111_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s35
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB111_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7307,17 +7295,17 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: .LBB111_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -7337,14 +7325,14 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -7366,45 +7354,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB112_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s35
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB112_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7414,31 +7402,29 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
; VI-NEXT: .LBB112_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s34
-; VI-NEXT: v_mov_b32_e32 v5, s35
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB112_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umin_i64_noret_offset_scalar:
@@ -7447,14 +7433,14 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -7477,45 +7463,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v6, s6, 0
-; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB113_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB113_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v6, 1
-; SI-NEXT: v_readlane_b32 s6, v6, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7526,23 +7512,23 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_mov_b32_e32 v5, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: .LBB113_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB113_1
@@ -7556,20 +7542,20 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB113_1
@@ -7585,45 +7571,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v6, s6, 0
-; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB114_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB114_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v6, 1
-; SI-NEXT: v_readlane_b32 s6, v6, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7633,31 +7619,29 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v2, s34
+; VI-NEXT: v_mov_b32_e32 v3, s35
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_mov_b32_e32 v5, s6
; VI-NEXT: .LBB114_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s34
-; VI-NEXT: v_mov_b32_e32 v5, s35
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB114_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umin_i64_ret_offset_scalar:
@@ -7666,20 +7650,20 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB114_1
@@ -8248,45 +8232,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB121_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s35
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB121_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8297,17 +8281,17 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: .LBB121_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -8327,14 +8311,14 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -8356,45 +8340,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB122_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s35
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB122_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8404,31 +8388,29 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
; VI-NEXT: .LBB122_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s34
-; VI-NEXT: v_mov_b32_e32 v5, s35
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB122_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_min_i64_noret_offset_scalar:
@@ -8437,14 +8419,14 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -8467,45 +8449,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v6, s6, 0
-; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB123_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB123_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v6, 1
-; SI-NEXT: v_readlane_b32 s6, v6, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8516,23 +8498,23 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_mov_b32_e32 v5, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: .LBB123_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB123_1
@@ -8546,20 +8528,20 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB123_1
@@ -8575,45 +8557,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v6, s6, 0
-; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB124_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s34
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB124_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v6, 1
-; SI-NEXT: v_readlane_b32 s6, v6, 0
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8623,31 +8605,29 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v2, s34
+; VI-NEXT: v_mov_b32_e32 v3, s35
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_mov_b32_e32 v5, s6
; VI-NEXT: .LBB124_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v4, s34
-; VI-NEXT: v_mov_b32_e32 v5, s35
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
; VI-NEXT: s_cbranch_execnz .LBB124_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[36:37]
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_min_i64_ret_offset_scalar:
@@ -8656,20 +8636,20 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB124_1
@@ -8693,29 +8673,29 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: v_mov_b32_e32 v4, s3
+; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s8
; SI-NEXT: v_mov_b32_e32 v3, s9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB125_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB125_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8725,26 +8705,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; VI-NEXT: s_add_u32 s0, s0, s6
+; VI-NEXT: s_addc_u32 s1, s1, s7
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: .LBB125_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -8761,24 +8741,24 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -8807,17 +8787,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: v_mov_b32_e32 v8, s5
+; SI-NEXT: v_mov_b32_e32 v9, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: .LBB126_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v7, v3
; SI-NEXT: v_mov_b32_e32 v6, v2
@@ -8844,68 +8824,68 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
; VI-LABEL: atomic_min_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_mov_b64 s[8:9], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; VI-NEXT: s_add_u32 s0, s0, s6
; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: .LBB126_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: v_mov_b32_e32 v6, s4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v3
+; VI-NEXT: v_mov_b32_e32 v8, v2
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; VI-NEXT: s_cbranch_execnz .LBB126_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[6:7]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_or_b64 exec, exec, s[8:9]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mov_b32_e32 v3, s12
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB126_1
@@ -8927,35 +8907,35 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: s_mov_b32 s6, s2
-; SI-NEXT: s_mov_b32 s7, s3
-; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: v_mov_b32_e32 v4, s3
+; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s8
-; SI-NEXT: v_mov_b32_e32 v3, s9
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v2, s4
+; SI-NEXT: v_mov_b32_e32 v3, s5
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: .LBB127_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s7
-; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
-; SI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB127_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_endpgm
@@ -8966,18 +8946,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: v_mov_b32_e32 v7, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: .LBB127_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -8994,20 +8974,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -9034,17 +9014,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: v_mov_b32_e32 v8, s5
+; SI-NEXT: v_mov_b32_e32 v9, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: .LBB128_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v7, v3
; SI-NEXT: v_mov_b32_e32 v6, v2
@@ -9073,64 +9053,64 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: s_add_u32 s6, s0, s6
+; VI-NEXT: s_addc_u32 s7, s1, s7
+; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: v_mov_b32_e32 v5, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: .LBB128_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: v_mov_b32_e32 v6, s4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; VI-NEXT: v_mov_b32_e32 v9, v3
+; VI-NEXT: v_mov_b32_e32 v8, v2
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB128_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[6:7]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mov_b32_e32 v3, s12
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[5:6]
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB128_1
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 785114cdbf39ab..77924222919984 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -328,12 +328,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1
-; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
@@ -399,14 +400,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f32_e32 v0, v2, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v4, v1, v1
-; GFX9-NEXT: v_max_f32_e32 v0, v4, v0
+; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
@@ -757,14 +758,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1
-; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0
+; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6
; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
@@ -1366,12 +1367,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1
-; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
@@ -1437,14 +1439,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f32_e32 v0, v2, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v4, v1, v1
-; GFX9-NEXT: v_max_f32_e32 v0, v4, v0
+; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
@@ -1795,14 +1797,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1
-; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0
+; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6
; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
@@ -2404,12 +2406,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1
-; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
@@ -2475,14 +2478,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f32_e32 v0, v2, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v4, v1, v1
-; GFX9-NEXT: v_max_f32_e32 v0, v4, v0
+; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
@@ -2833,14 +2836,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1
-; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0
+; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6
; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
@@ -4051,12 +4054,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
@@ -4064,7 +4067,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1
; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -4080,16 +4083,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3]
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42]
; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
@@ -4163,20 +4166,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX9-NEXT: .LBB7_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
; GFX9-NEXT: v_readlane_b32 s3, v1, s4
; GFX9-NEXT: v_readlane_b32 s2, v0, s4
-; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5]
+; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4188,14 +4191,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45]
; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX9-NEXT: s_add_u32 s8, s36, 44
; GFX9-NEXT: s_addc_u32 s9, s37, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
@@ -4203,19 +4206,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s44
@@ -4226,8 +4229,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0
+; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
@@ -4272,20 +4275,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v41, 0
-; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4297,26 +4300,28 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1064-NEXT: s_mov_b64 s[46:47], 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45]
; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_addc_u32 s9, s35, 0
; GFX1064-NEXT: s_getpc_b64 s[0:1]
; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, s44
+; GFX1064-NEXT: v_mov_b32_e32 v3, s45
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
@@ -4326,18 +4331,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v3, s45
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0
+; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4
; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
@@ -4382,20 +4385,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: s_movk_i32 s32, 0x400
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_mov_b32_e32 v41, 0
-; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5]
+; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4407,25 +4410,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45]
; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_addc_u32 s9, s35, 0
; GFX1032-NEXT: s_getpc_b64 s[0:1]
; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, s44
+; GFX1032-NEXT: v_mov_b32_e32 v3, s45
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
@@ -4435,18 +4440,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v3, s45
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0
+; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4
; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46
@@ -4481,14 +4484,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: v_mov_b32_e32 v41, 0
-; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
-; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
@@ -4497,7 +4500,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4510,16 +4513,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1164-NEXT: s_mov_b64 s[46:47], 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45]
; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1164-NEXT: .p2align 6
; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1164-NEXT: s_add_u32 s8, s34, 44
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[0:1]
@@ -4527,7 +4530,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v40
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v2, s44
+; GFX1164-NEXT: v_mov_b32_e32 v3, s45
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: v_mov_b32_e32 v7, 0
; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -4535,19 +4540,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: s_mov_b32 s12, s43
; GFX1164-NEXT: s_mov_b32 s13, s42
; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1164-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s44
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v3, s45
+; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off
; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
@@ -4584,14 +4586,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: v_mov_b32_e32 v41, 0
-; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
@@ -4600,7 +4602,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5]
+; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4613,22 +4615,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45]
; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1132-NEXT: .p2align 6
; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[0:1]
; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: v_mov_b32_e32 v31, v40
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_mov_b32_e32 v3, s45
; GFX1132-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -4636,16 +4640,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: s_mov_b32 s12, s43
; GFX1132-NEXT: s_mov_b32 s13, s42
; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1132-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v2, s44
+; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off
+; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8
+; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off
; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
@@ -4684,32 +4688,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42
; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41]
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3]
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0
; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
@@ -4730,17 +4731,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42
; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1
@@ -5093,23 +5094,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
@@ -5122,6 +5123,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
@@ -5131,9 +5134,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45
@@ -5228,9 +5229,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
@@ -5239,6 +5240,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45]
@@ -5246,9 +5248,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: .p2align 6
; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
@@ -5257,6 +5258,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -5264,13 +5267,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_mov_b32 s12, s43
; GFX1164-DPP-NEXT: s_mov_b32 s13, s42
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -5353,9 +5353,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s46, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -5363,15 +5363,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45]
; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1132-DPP-NEXT: .p2align 6
; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
@@ -5379,6 +5379,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -5386,11 +5387,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_mov_b32 s12, s43
; GFX1132-DPP-NEXT: s_mov_b32 s13, s42
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -5807,12 +5807,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[6:7], v[0:1]
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
@@ -5882,15 +5883,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3]
-; GFX9-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1]
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -6067,16 +6068,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3]
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1]
+; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -6137,15 +6138,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3]
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1]
+; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -6189,24 +6191,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5]
-; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6560,16 +6563,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1]
; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9]
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7]
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1]
; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9]
@@ -6651,15 +6655,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1]
; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9]
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7]
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1]
; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9]
@@ -7598,12 +7602,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
@@ -7611,7 +7615,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1
; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -7627,16 +7631,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3]
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42]
; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
@@ -7710,20 +7714,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX9-NEXT: .LBB11_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
; GFX9-NEXT: v_readlane_b32 s3, v1, s4
; GFX9-NEXT: v_readlane_b32 s2, v0, s4
-; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5]
+; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7735,14 +7739,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45]
; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX9-NEXT: s_add_u32 s8, s36, 44
; GFX9-NEXT: s_addc_u32 s9, s37, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
@@ -7750,19 +7754,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s44
@@ -7773,8 +7777,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0
+; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
@@ -7819,20 +7823,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v41, 0
-; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7844,26 +7848,28 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1064-NEXT: s_mov_b64 s[46:47], 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45]
; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_addc_u32 s9, s35, 0
; GFX1064-NEXT: s_getpc_b64 s[0:1]
; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, s44
+; GFX1064-NEXT: v_mov_b32_e32 v3, s45
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
@@ -7873,18 +7879,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v3, s45
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0
+; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4
; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
@@ -7929,20 +7933,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: s_movk_i32 s32, 0x400
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_mov_b32_e32 v41, 0
-; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5]
+; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7954,25 +7958,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45]
; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_addc_u32 s9, s35, 0
; GFX1032-NEXT: s_getpc_b64 s[0:1]
; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, s44
+; GFX1032-NEXT: v_mov_b32_e32 v3, s45
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
@@ -7982,18 +7988,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v3, s45
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0
+; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4
; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46
@@ -8028,14 +8032,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: v_mov_b32_e32 v41, 0
-; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
-; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
@@ -8044,7 +8048,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8057,16 +8061,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1164-NEXT: s_mov_b64 s[46:47], 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45]
; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1164-NEXT: .p2align 6
; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1164-NEXT: s_add_u32 s8, s34, 44
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[0:1]
@@ -8074,7 +8078,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v40
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v2, s44
+; GFX1164-NEXT: v_mov_b32_e32 v3, s45
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: v_mov_b32_e32 v7, 0
; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -8082,19 +8088,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: s_mov_b32 s12, s43
; GFX1164-NEXT: s_mov_b32 s13, s42
; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1164-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s44
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v3, s45
+; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off
; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
@@ -8131,14 +8134,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: v_mov_b32_e32 v41, 0
-; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
@@ -8147,7 +8150,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5]
+; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8160,22 +8163,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45]
; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1132-NEXT: .p2align 6
; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[0:1]
; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: v_mov_b32_e32 v31, v40
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_mov_b32_e32 v3, s45
; GFX1132-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -8183,16 +8188,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: s_mov_b32 s12, s43
; GFX1132-NEXT: s_mov_b32 s13, s42
; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1132-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v2, s44
+; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off
+; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8
+; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off
; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
@@ -8231,32 +8236,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42
; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41]
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3]
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0
; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
@@ -8277,17 +8279,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42
; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1
@@ -8640,23 +8642,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
@@ -8669,6 +8671,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
@@ -8678,9 +8682,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45
@@ -8775,9 +8777,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
@@ -8786,6 +8788,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45]
@@ -8793,9 +8796,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: .p2align 6
; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
@@ -8804,6 +8806,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -8811,13 +8815,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_mov_b32 s12, s43
; GFX1164-DPP-NEXT: s_mov_b32 s13, s42
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -8900,9 +8901,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s46, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -8910,15 +8911,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45]
; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1132-DPP-NEXT: .p2align 6
; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
@@ -8926,6 +8927,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -8933,11 +8935,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_mov_b32 s12, s43
; GFX1132-DPP-NEXT: s_mov_b32 s13, s42
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 1f521f24449847..cb3291df891af4 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -328,12 +328,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1
-; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
@@ -399,14 +400,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f32_e32 v0, v2, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v4, v1, v1
-; GFX9-NEXT: v_min_f32_e32 v0, v4, v0
+; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
@@ -757,14 +758,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1
-; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0
+; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6
; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
@@ -1366,12 +1367,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1
-; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
@@ -1437,14 +1439,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f32_e32 v0, v2, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v4, v1, v1
-; GFX9-NEXT: v_min_f32_e32 v0, v4, v0
+; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
@@ -1795,14 +1797,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1
-; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0
+; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6
; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
@@ -2404,12 +2406,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1
-; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
@@ -2475,14 +2478,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f32_e32 v0, v2, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v4, v1, v1
-; GFX9-NEXT: v_min_f32_e32 v0, v4, v0
+; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
@@ -2833,14 +2836,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1
-; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0
+; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6
; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
@@ -4051,12 +4054,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
@@ -4064,7 +4067,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5]
+; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1
; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -4080,16 +4083,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3]
+; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42]
; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
@@ -4163,20 +4166,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX9-NEXT: .LBB7_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
; GFX9-NEXT: v_readlane_b32 s3, v1, s4
; GFX9-NEXT: v_readlane_b32 s2, v0, s4
-; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5]
+; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4188,14 +4191,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45]
; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX9-NEXT: s_add_u32 s8, s36, 44
; GFX9-NEXT: s_addc_u32 s9, s37, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
@@ -4203,19 +4206,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s44
@@ -4226,8 +4229,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0
+; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
@@ -4272,20 +4275,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v41, 0
-; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5]
+; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4297,26 +4300,28 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1064-NEXT: s_mov_b64 s[46:47], 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45]
; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_addc_u32 s9, s35, 0
; GFX1064-NEXT: s_getpc_b64 s[0:1]
; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, s44
+; GFX1064-NEXT: v_mov_b32_e32 v3, s45
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
@@ -4326,18 +4331,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v3, s45
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0
+; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4
; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
@@ -4382,20 +4385,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: s_movk_i32 s32, 0x400
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_mov_b32_e32 v41, 0
-; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1032-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5]
+; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4407,25 +4410,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45]
; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_addc_u32 s9, s35, 0
; GFX1032-NEXT: s_getpc_b64 s[0:1]
; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, s44
+; GFX1032-NEXT: v_mov_b32_e32 v3, s45
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
@@ -4435,18 +4440,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v3, s45
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0
+; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4
; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46
@@ -4481,14 +4484,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: v_mov_b32_e32 v41, 0
-; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
-; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
@@ -4497,7 +4500,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5]
+; GFX1164-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4510,16 +4513,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1164-NEXT: s_mov_b64 s[46:47], 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45]
; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1164-NEXT: .p2align 6
; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1164-NEXT: s_add_u32 s8, s34, 44
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[0:1]
@@ -4527,7 +4530,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v40
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v2, s44
+; GFX1164-NEXT: v_mov_b32_e32 v3, s45
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: v_mov_b32_e32 v7, 0
; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -4535,19 +4540,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: s_mov_b32 s12, s43
; GFX1164-NEXT: s_mov_b32 s13, s42
; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1164-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s44
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v3, s45
+; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off
; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
@@ -4584,14 +4586,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: v_mov_b32_e32 v41, 0
-; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
@@ -4600,7 +4602,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5]
+; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4613,22 +4615,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45]
; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1132-NEXT: .p2align 6
; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[0:1]
; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: v_mov_b32_e32 v31, v40
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_mov_b32_e32 v3, s45
; GFX1132-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -4636,16 +4640,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: s_mov_b32 s12, s43
; GFX1132-NEXT: s_mov_b32 s13, s42
; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1132-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v2, s44
+; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off
+; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8
+; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off
; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
@@ -4684,32 +4688,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42
; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41]
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3]
+; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0
; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
@@ -4730,17 +4731,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42
; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1
@@ -5093,23 +5094,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
@@ -5122,6 +5123,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
@@ -5131,9 +5134,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45
@@ -5228,9 +5229,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
@@ -5239,6 +5240,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45]
@@ -5246,9 +5248,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: .p2align 6
; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
@@ -5257,6 +5258,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -5264,13 +5267,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_mov_b32 s12, s43
; GFX1164-DPP-NEXT: s_mov_b32 s13, s42
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -5353,9 +5353,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s46, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -5363,15 +5363,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45]
; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1132-DPP-NEXT: .p2align 6
; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
@@ -5379,6 +5379,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -5386,11 +5387,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_mov_b32 s12, s43
; GFX1132-DPP-NEXT: s_mov_b32 s13, s42
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -5807,12 +5807,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7LESS-NEXT: v_min_f64 v[0:1], v[6:7], v[0:1]
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
@@ -5882,15 +5883,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3]
-; GFX9-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1]
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -6067,16 +6068,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3]
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1]
+; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -6137,15 +6138,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3]
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1]
+; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
@@ -6189,24 +6191,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5]
-; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6560,16 +6563,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1]
; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9]
+; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7]
+; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1]
; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9]
@@ -6651,15 +6655,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1]
; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9]
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7]
+; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1]
; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9]
@@ -7598,12 +7602,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
@@ -7611,7 +7615,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5]
+; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1
; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -7627,16 +7631,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0
+; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3]
+; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42]
; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
@@ -7710,20 +7714,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX9-NEXT: .LBB11_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
; GFX9-NEXT: v_readlane_b32 s3, v1, s4
; GFX9-NEXT: v_readlane_b32 s2, v0, s4
-; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5]
+; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7735,14 +7739,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45]
; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX9-NEXT: s_add_u32 s8, s36, 44
; GFX9-NEXT: s_addc_u32 s9, s37, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
@@ -7750,19 +7754,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
-; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s44
@@ -7773,8 +7777,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0
+; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
@@ -7819,20 +7823,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v41, 0
-; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5]
+; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7844,26 +7848,28 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1064-NEXT: s_mov_b64 s[46:47], 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45]
; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_addc_u32 s9, s35, 0
; GFX1064-NEXT: s_getpc_b64 s[0:1]
; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX1064-NEXT: v_mov_b32_e32 v31, v40
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, s44
+; GFX1064-NEXT: v_mov_b32_e32 v3, s45
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
@@ -7873,18 +7879,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-NEXT: v_mov_b32_e32 v5, 8
-; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v3, s45
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0
+; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4
; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
@@ -7929,20 +7933,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: s_movk_i32 s32, 0x400
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_mov_b32_e32 v41, 0
-; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1032-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5]
+; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7954,25 +7958,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45]
; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_addc_u32 s9, s35, 0
; GFX1032-NEXT: s_getpc_b64 s[0:1]
; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX1032-NEXT: v_mov_b32_e32 v31, v40
-; GFX1032-NEXT: v_mov_b32_e32 v0, 8
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, s44
+; GFX1032-NEXT: v_mov_b32_e32 v3, s45
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
@@ -7982,18 +7988,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT: v_mov_b32_e32 v3, s45
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0
+; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4
; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46
@@ -8028,14 +8032,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: v_mov_b32_e32 v41, 0
-; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
-; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
@@ -8044,7 +8048,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5]
+; GFX1164-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8057,16 +8061,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1164-NEXT: s_mov_b64 s[46:47], 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45]
+; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45]
; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1164-NEXT: .p2align 6
; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1164-NEXT: s_add_u32 s8, s34, 44
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[0:1]
@@ -8074,7 +8078,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX1164-NEXT: v_mov_b32_e32 v31, v40
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v2, s44
+; GFX1164-NEXT: v_mov_b32_e32 v3, s45
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: v_mov_b32_e32 v7, 0
; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -8082,19 +8088,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: s_mov_b32 s12, s43
; GFX1164-NEXT: s_mov_b32 s13, s42
; GFX1164-NEXT: s_mov_b32 s14, s33
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1164-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_mov_b32_e32 v2, s44
-; GFX1164-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT: v_mov_b32_e32 v3, s45
+; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off
; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
@@ -8131,14 +8134,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: v_mov_b32_e32 v41, 0
-; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000
+; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42]
+; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
@@ -8147,7 +8150,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5]
+; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8160,22 +8163,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45]
+; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45]
; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1132-NEXT: .p2align 6
; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[0:1]
; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: v_mov_b32_e32 v31, v40
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_mov_b32_e32 v3, s45
; GFX1132-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -8183,16 +8188,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: s_mov_b32 s12, s43
; GFX1132-NEXT: s_mov_b32 s13, s42
; GFX1132-NEXT: s_mov_b32 s14, s33
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1132-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
-; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v2, s44
+; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off
+; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8
+; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off
; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
@@ -8231,32 +8236,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2
+; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42
; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1
-; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41]
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3]
+; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0
; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
@@ -8277,17 +8279,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43
; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42
; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1
@@ -8640,23 +8642,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
@@ -8669,6 +8671,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
@@ -8678,9 +8682,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45
@@ -8775,9 +8777,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
@@ -8786,6 +8788,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45]
@@ -8793,9 +8796,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: .p2align 6
; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
@@ -8804,6 +8806,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -8811,13 +8815,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_mov_b32 s12, s43
; GFX1164-DPP-NEXT: s_mov_b32 s13, s42
; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -8900,9 +8901,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s46, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -8910,15 +8911,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45]
; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1132-DPP-NEXT: .p2align 6
; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42]
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2]
+; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
@@ -8926,6 +8927,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
@@ -8933,11 +8935,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_mov_b32 s12, s43
; GFX1132-DPP-NEXT: s_mov_b32 s13, s42
; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index f1d58814f9f04a..5c5a769178dd94 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -54,18 +54,18 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5]
; GFX11-NEXT: v_mov_b32_e32 v31, v0
-; GFX11-NEXT: s_load_b32 s24, s[16:17], 0x24
+; GFX11-NEXT: s_load_b32 s19, s[16:17], 0x24
; GFX11-NEXT: s_mov_b32 s12, s13
; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31
; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX11-NEXT: s_mov_b32 s19, 0
+; GFX11-NEXT: s_mov_b32 s20, 0
; GFX11-NEXT: s_mov_b32 s0, -1
; GFX11-NEXT: s_mov_b32 s3, exec_lo
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0
+; GFX11-NEXT: v_mul_lo_u32 v0, s19, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB2_13
@@ -74,7 +74,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_mov_b32 s18, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitcmp1_b32 s21, 0
-; GFX11-NEXT: s_cselect_b32 s19, -1, 0
+; GFX11-NEXT: s_cselect_b32 s24, -1, 0
; GFX11-NEXT: s_bitcmp0_b32 s21, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB2_3
; GFX11-NEXT: ; %bb.2: ; %bb15
@@ -110,60 +110,58 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_cbranch_scc0 .LBB2_8
; GFX11-NEXT: ; %bb.5: ; %bb18.preheader
; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44
-; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mul_hi_u32 s0, s29, s28
; GFX11-NEXT: s_mul_i32 s1, s29, s28
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1
; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s0, s0, s30
-; GFX11-NEXT: s_mul_i32 s0, s0, s22
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_mul_i32 s0, s0, s22
; GFX11-NEXT: s_mul_i32 s0, s0, s20
-; GFX11-NEXT: s_or_b32 s0, s24, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s19, s0
; GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1
-; GFX11-NEXT: global_load_u16 v0, v0, s[20:21]
+; GFX11-NEXT: s_mov_b32 s0, s1
+; GFX11-NEXT: global_load_u16 v1, v0, s[20:21]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB2_6: ; %bb18
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s19
+; GFX11-NEXT: v_readfirstlane_b32 s13, v0
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: v_readfirstlane_b32 s20, v0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
; GFX11-NEXT: s_and_b32 s1, s8, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_and_b32 s1, s1, exec_lo
-; GFX11-NEXT: v_readfirstlane_b32 s21, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_cselect_b32 s1, s21, s20
-; GFX11-NEXT: s_and_b32 s20, 0xffff, s13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s19, v2
+; GFX11-NEXT: s_cselect_b32 s1, s19, s13
+; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
; GFX11-NEXT: s_and_b32 s1, s1, 1
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_cselect_b32 s20, -1, 0
-; GFX11-NEXT: s_and_b32 s22, s9, exec_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s20
-; GFX11-NEXT: v_readfirstlane_b32 s20, v1
+; GFX11-NEXT: s_cmp_lg_u32 s13, 0
+; GFX11-NEXT: s_cselect_b32 s13, -1, 0
+; GFX11-NEXT: s_and_b32 s20, s9, exec_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
+; GFX11-NEXT: v_readfirstlane_b32 s13, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s21, v0
-; GFX11-NEXT: s_cselect_b32 s20, s21, s20
+; GFX11-NEXT: v_readfirstlane_b32 s19, v2
+; GFX11-NEXT: s_cselect_b32 s13, s19, s13
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_bitcmp1_b32 s20, 0
-; GFX11-NEXT: s_cselect_b32 s20, 0x100, 0
-; GFX11-NEXT: s_or_b32 s13, s20, s13
+; GFX11-NEXT: s_bitcmp1_b32 s13, 0
+; GFX11-NEXT: s_cselect_b32 s13, 0x100, 0
+; GFX11-NEXT: s_or_b32 s0, s13, s0
; GFX11-NEXT: s_cbranch_vccz .LBB2_6
; GFX11-NEXT: ; %bb.7: ; %Flow
; GFX11-NEXT: s_mov_b32 s0, 0
@@ -181,7 +179,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: ; %bb.11: ; %Flow6
; GFX11-NEXT: s_mov_b32 s18, -1
; GFX11-NEXT: .LBB2_12: ; %Flow11
-; GFX11-NEXT: s_and_b32 s19, s2, exec_lo
+; GFX11-NEXT: s_and_b32 s20, s2, exec_lo
; GFX11-NEXT: s_or_not1_b32 s0, s18, exec_lo
; GFX11-NEXT: .LBB2_13: ; %Flow9
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -198,10 +196,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_mov_b32 s14, s15
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_or_b32 s19, s19, exec_lo
+; GFX11-NEXT: s_or_b32 s20, s20, exec_lo
; GFX11-NEXT: .LBB2_15: ; %Flow14
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11-NEXT: s_and_saveexec_b32 s0, s19
+; GFX11-NEXT: s_and_saveexec_b32 s0, s20
; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock
; GFX11-NEXT: ; divergent unreachable
; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock
diff --git a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
index 67cdf196a46937..dd478f94e1039e 100644
--- a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
+++ b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
@@ -32,38 +32,38 @@ body: |
; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: bb.1:
- ; GCN-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
- ; GCN-NEXT: liveins: $vcc
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
; GCN-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY1]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_1]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_2:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY2]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_2]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_3:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY3]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_3]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_4:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY4]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_4]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_5:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY5]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_5]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_6:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY6]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_6]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_7:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY7]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_7]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_8:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY8]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_8]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_9:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY9]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_9]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_10:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY10]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_10]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_11:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY11]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_11]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_12:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY12]], implicit $mode, implicit $exec
- ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_12]], implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+ ; GCN-NEXT: liveins: $vcc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_1]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_2]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_3]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_4]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_5]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_6]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_7]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_8]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_9]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_10]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_11]], implicit $exec
+ ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_12]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_13:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec
; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_13]], implicit $exec
; GCN-NEXT: [[V_CVT_F64_I32_e32_14:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
index b079637103fa4f..eb2d95e4db2d5f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
@@ -6,20 +6,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %i
; CHECK-LABEL: struct_atomic_buffer_load_i32:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB0_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -67,20 +67,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i3
; CHECK-LABEL: struct_atomic_buffer_load_i32_off:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB2_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB2_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -99,20 +99,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i
; CHECK-LABEL: struct_atomic_buffer_load_i32_soff:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB3_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 4 idxen offset:4 glc
+; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB3_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -130,20 +130,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i3
; CHECK-LABEL: struct_atomic_buffer_load_i32_dlc:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB4_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 dlc
+; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB4_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -194,20 +194,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i
; CHECK-LABEL: struct_atomic_buffer_load_i64:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
; CHECK-NEXT: .LBB6_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, s4
-; CHECK-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1]
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB6_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -227,20 +227,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32
; CHECK-LABEL: struct_atomic_buffer_load_v2i16:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB7_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB7_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -260,23 +260,23 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32
; CHECK-LABEL: struct_atomic_buffer_load_v4i16:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB8_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB8_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -297,20 +297,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32
; CHECK-LABEL: struct_atomic_buffer_load_v4i32:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB9_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b128 v[1:4], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB9_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -330,22 +330,22 @@ define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %i
; CHECK-LABEL: struct_atomic_buffer_load_ptr:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB10_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_load_b32 v1, v[1:2]
+; CHECK-NEXT: flat_load_b32 v2, v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB10_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
index e056f05aea4514..bc50b12b590496 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
@@ -6,20 +6,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %p
; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB0_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -67,20 +67,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8
; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_off:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB2_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB2_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -99,20 +99,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace(
; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_soff:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB3_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 4 idxen offset:4 glc
+; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB3_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -130,20 +130,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8
; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_dlc:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB4_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 dlc
+; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB4_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -194,20 +194,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p
; CHECK-LABEL: struct_ptr_atomic_buffer_load_i64:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
; CHECK-NEXT: .LBB6_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, s4
-; CHECK-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1]
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB6_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -227,20 +227,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8)
; CHECK-LABEL: struct_ptr_atomic_buffer_load_v2i16:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB7_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB7_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -260,23 +260,23 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8)
; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i16:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB8_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB8_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -297,20 +297,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8)
; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i32:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB9_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b128 v[1:4], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB9_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
@@ -330,22 +330,22 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %p
; CHECK-LABEL: struct_ptr_atomic_buffer_load_ptr:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB10_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_load_b32 v1, v[1:2]
+; CHECK-NEXT: flat_load_b32 v2, v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB10_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index f961b7d9d52239..23b57a7efa586c 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -5818,38 +5818,39 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: v_dual_add_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v5, v2
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -5864,33 +5865,33 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT: v_add_f32_e32 v2, v5, v2
-; GFX10-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4
-; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB24_1
@@ -5904,29 +5905,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v6, v2
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -5943,29 +5944,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v2, v6, v2
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -5983,30 +5984,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v2, v6, v2
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6124,38 +6125,39 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: v_dual_add_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v5, v2
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -6170,33 +6172,33 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT: v_add_f32_e32 v2, v5, v2
-; GFX10-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4
-; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB25_1
@@ -6210,29 +6212,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v6, v2
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6249,29 +6251,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_add_f32_e32 v2, v6, v2
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6289,30 +6291,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_add_f32_e32 v2, v6, v2
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6430,39 +6432,39 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-LABEL: local_atomic_fadd_noret_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0
+; GFX11-NEXT: ds_load_b32 v3, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_add_f32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -6475,34 +6477,34 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX10-LABEL: local_atomic_fadd_noret_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v2, v0
+; GFX10-NEXT: ds_read_b32 v3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX10-NEXT: v_add_f32_e32 v4, v6, v5
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v1
; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
@@ -6513,35 +6515,35 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v3, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6551,35 +6553,35 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX908-LABEL: local_atomic_fadd_noret_v2bf16:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v2, v0
+; GFX908-NEXT: ds_read_b32 v3, v0
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX908-NEXT: v_add_f32_e32 v4, v6, v5
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB26_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6590,36 +6592,36 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v2, v0
+; GFX8-NEXT: ds_read_b32 v3, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_add_f32_e32 v4, v6, v5
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6727,39 +6729,39 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-LABEL: local_atomic_fadd_noret_v2bf16__ofset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_add_f32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -6772,34 +6774,34 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX10-LABEL: local_atomic_fadd_noret_v2bf16__ofset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX10-NEXT: v_add_f32_e32 v4, v6, v5
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v1
; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB27_1
@@ -6810,35 +6812,35 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16__ofset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6848,35 +6850,35 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX908-LABEL: local_atomic_fadd_noret_v2bf16__ofset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX908-NEXT: v_add_f32_e32 v4, v6, v5
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB27_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6887,36 +6889,36 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_add_f32_e32 v4, v6, v5
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB27_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7628,22 +7630,21 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX7-NEXT: s_cbranch_execz .LBB28_4
; GFX7-NEXT: ; %bb.1:
-; GFX7-NEXT: s_lshl_b32 s10, s3, 3
-; GFX7-NEXT: v_mov_b32_e32 v1, s10
-; GFX7-NEXT: ds_read_b32 v1, v1
+; GFX7-NEXT: s_lshl_b32 s8, s3, 3
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: ds_read_b32 v1, v2
; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
-; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
+; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0
+; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB28_2: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v1, s10
-; GFX7-NEXT: v_add_f32_e32 v4, v3, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v1
+; GFX7-NEXT: v_add_f32_e32 v1, v4, v3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4
; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB28_2
@@ -7659,23 +7660,22 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
; GFX7-NEXT: s_cbranch_execz .LBB28_7
; GFX7-NEXT: ; %bb.5:
-; GFX7-NEXT: s_lshl_b32 s3, s3, 4
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: ds_read_b32 v2, v1
+; GFX7-NEXT: s_lshl_b32 s0, s3, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_read_b32 v3, v1
; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
-; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
+; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
+; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB28_6: ; %atomicrmw.start2
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v2, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, s3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3
+; GFX7-NEXT: v_add_f32_e32 v4, v3, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3
; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB28_6
; GFX7-NEXT: .LBB28_7: ; %Flow22
@@ -7710,24 +7710,23 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1]
; GFX7-NEXT: s_cbranch_execz .LBB28_13
; GFX7-NEXT: ; %bb.10:
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: v_mov_b32_e32 v3, s2
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v2, v2
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: ds_read_b32 v2, v3
+; GFX7-NEXT: s_mov_b64 s[2:3], 0
; GFX7-NEXT: .LBB28_11: ; %atomicrmw.start8
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_add_f32_e32 v4, v3, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v4, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3
-; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4
+; GFX7-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX7-NEXT: s_cbranch_execnz .LBB28_11
; GFX7-NEXT: ; %bb.12: ; %Flow
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7-NEXT: .LBB28_13: ; %Flow20
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -7755,22 +7754,21 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX6-NEXT: s_cbranch_execz .LBB28_4
; GFX6-NEXT: ; %bb.1:
-; GFX6-NEXT: s_lshl_b32 s10, s3, 3
-; GFX6-NEXT: v_mov_b32_e32 v1, s10
-; GFX6-NEXT: ds_read_b32 v1, v1
+; GFX6-NEXT: s_lshl_b32 s8, s3, 3
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: ds_read_b32 v1, v2
; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
-; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0
+; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB28_2: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v1, s10
-; GFX6-NEXT: v_add_f32_e32 v4, v3, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v1
+; GFX6-NEXT: v_add_f32_e32 v1, v4, v3
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4
; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB28_2
@@ -7786,23 +7784,22 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
; GFX6-NEXT: s_cbranch_execz .LBB28_7
; GFX6-NEXT: ; %bb.5:
-; GFX6-NEXT: s_lshl_b32 s3, s3, 4
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: ds_read_b32 v2, v1
+; GFX6-NEXT: s_lshl_b32 s0, s3, 4
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: ds_read_b32 v3, v1
; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
-; GFX6-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
+; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB28_6: ; %atomicrmw.start2
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v2, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, s3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3
+; GFX6-NEXT: v_add_f32_e32 v4, v3, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3
; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB28_6
; GFX6-NEXT: .LBB28_7: ; %Flow20
@@ -7837,24 +7834,23 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1]
; GFX6-NEXT: s_cbranch_execz .LBB28_13
; GFX6-NEXT: ; %bb.10:
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-NEXT: v_mov_b32_e32 v3, s2
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v2, v2
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: ds_read_b32 v2, v3
+; GFX6-NEXT: s_mov_b64 s[2:3], 0
; GFX6-NEXT: .LBB28_11: ; %atomicrmw.start8
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
-; GFX6-NEXT: v_add_f32_e32 v4, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_add_f32_e32 v2, v4, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3
-; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4
+; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX6-NEXT: s_cbranch_execnz .LBB28_11
; GFX6-NEXT: ; %bb.12: ; %Flow
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX6-NEXT: .LBB28_13: ; %Flow18
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -8471,22 +8467,21 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX7-NEXT: s_cbranch_execz .LBB29_4
; GFX7-NEXT: ; %bb.1:
-; GFX7-NEXT: s_lshl_b32 s10, s3, 3
-; GFX7-NEXT: v_mov_b32_e32 v1, s10
-; GFX7-NEXT: ds_read_b32 v1, v1
+; GFX7-NEXT: s_lshl_b32 s8, s3, 3
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: ds_read_b32 v1, v2
; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
-; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
+; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0
+; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB29_2: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v1, s10
-; GFX7-NEXT: v_add_f32_e32 v4, v3, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v1
+; GFX7-NEXT: v_add_f32_e32 v1, v4, v3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4
; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB29_2
@@ -8502,23 +8497,22 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
; GFX7-NEXT: s_cbranch_execz .LBB29_7
; GFX7-NEXT: ; %bb.5:
-; GFX7-NEXT: s_lshl_b32 s3, s3, 4
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: ds_read_b32 v2, v1
+; GFX7-NEXT: s_lshl_b32 s0, s3, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_read_b32 v3, v1
; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
-; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
+; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
+; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB29_6: ; %atomicrmw.start2
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v2, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, s3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3
+; GFX7-NEXT: v_add_f32_e32 v4, v3, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3
; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB29_6
; GFX7-NEXT: .LBB29_7: ; %Flow22
@@ -8553,24 +8547,23 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1]
; GFX7-NEXT: s_cbranch_execz .LBB29_13
; GFX7-NEXT: ; %bb.10:
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: v_mov_b32_e32 v3, s2
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v2, v2
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: ds_read_b32 v2, v3
+; GFX7-NEXT: s_mov_b64 s[2:3], 0
; GFX7-NEXT: .LBB29_11: ; %atomicrmw.start8
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_add_f32_e32 v4, v3, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v4, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3
-; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4
+; GFX7-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX7-NEXT: s_cbranch_execnz .LBB29_11
; GFX7-NEXT: ; %bb.12: ; %Flow
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7-NEXT: .LBB29_13: ; %Flow20
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -8598,22 +8591,21 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX6-NEXT: s_cbranch_execz .LBB29_4
; GFX6-NEXT: ; %bb.1:
-; GFX6-NEXT: s_lshl_b32 s10, s3, 3
-; GFX6-NEXT: v_mov_b32_e32 v1, s10
-; GFX6-NEXT: ds_read_b32 v1, v1
+; GFX6-NEXT: s_lshl_b32 s8, s3, 3
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: ds_read_b32 v1, v2
; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
-; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0
+; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB29_2: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v1, s10
-; GFX6-NEXT: v_add_f32_e32 v4, v3, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v1
+; GFX6-NEXT: v_add_f32_e32 v1, v4, v3
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4
; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB29_2
@@ -8629,23 +8621,22 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
; GFX6-NEXT: s_cbranch_execz .LBB29_7
; GFX6-NEXT: ; %bb.5:
-; GFX6-NEXT: s_lshl_b32 s3, s3, 4
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: ds_read_b32 v2, v1
+; GFX6-NEXT: s_lshl_b32 s0, s3, 4
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: ds_read_b32 v3, v1
; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
-; GFX6-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
+; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB29_6: ; %atomicrmw.start2
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v2, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, s3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3
+; GFX6-NEXT: v_add_f32_e32 v4, v3, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3
; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB29_6
; GFX6-NEXT: .LBB29_7: ; %Flow20
@@ -8680,24 +8671,23 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1]
; GFX6-NEXT: s_cbranch_execz .LBB29_13
; GFX6-NEXT: ; %bb.10:
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-NEXT: v_mov_b32_e32 v3, s2
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v2, v2
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: ds_read_b32 v2, v3
+; GFX6-NEXT: s_mov_b64 s[2:3], 0
; GFX6-NEXT: .LBB29_11: ; %atomicrmw.start8
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
-; GFX6-NEXT: v_add_f32_e32 v4, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_add_f32_e32 v2, v4, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3
-; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4
+; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX6-NEXT: s_cbranch_execnz .LBB29_11
; GFX6-NEXT: ; %bb.12: ; %Flow
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX6-NEXT: .LBB29_13: ; %Flow18
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index 47058de71e7f4c..d419b0cdfdd1ab 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -4520,15 +4520,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3
-; GFX12-NEXT: v_pk_max_num_f16 v2, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4550,17 +4550,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v1, v1, v1
; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX940-NEXT: v_pk_max_f16 v2, v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v2, v2, v3
-; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB20_1
@@ -4573,15 +4573,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v4, v3, v3
-; GFX11-NEXT: v_pk_max_f16 v2, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -4600,14 +4600,14 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v4, v3, v3
-; GFX10-NEXT: v_pk_max_f16 v2, v4, v2
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -4626,16 +4626,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
@@ -4649,16 +4649,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v1, v1, v1
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX908-NEXT: v_pk_max_f16 v2, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v2, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB20_1
@@ -4673,17 +4673,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v2, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v2, v6, v2
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
+; GFX8-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -4792,15 +4792,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3
-; GFX12-NEXT: v_pk_max_num_f16 v2, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4822,17 +4822,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v1, v1, v1
; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX940-NEXT: v_pk_max_f16 v2, v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v2, v2, v3
-; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB21_1
@@ -4845,15 +4845,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v4, v3, v3
-; GFX11-NEXT: v_pk_max_f16 v2, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -4872,14 +4872,14 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v4, v3, v3
-; GFX10-NEXT: v_pk_max_f16 v2, v4, v2
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -4898,16 +4898,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1
; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
@@ -4921,16 +4921,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v1, v1, v1
; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX908-NEXT: v_pk_max_f16 v2, v4, v4
-; GFX908-NEXT: v_pk_max_f16 v2, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_1
@@ -4945,17 +4945,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v2, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v2, v6, v2
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
+; GFX8-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -5064,14 +5064,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
+; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v3
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5093,13 +5093,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v1, v1, v1
; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v1, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v3, v4, v3
+; GFX940-NEXT: v_pk_max_f16 v3, v3, v1
; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
@@ -5115,14 +5115,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v1, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v4, v3
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -5141,13 +5141,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v1, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v3, v4, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -5166,12 +5166,12 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT: v_pk_max_f16 v3, v4, v3
+; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1
; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
@@ -5188,12 +5188,12 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v1, v1, v1
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v1, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT: v_pk_max_f16 v3, v4, v3
+; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX908-NEXT: v_pk_max_f16 v3, v3, v1
; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
@@ -5209,23 +5209,23 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v2, v0
+; GFX8-NEXT: ds_read_b32 v3, v0
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v4, v6, v5
-; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v3, v3
+; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v1
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5326,14 +5326,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
+; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v3
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5355,13 +5355,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v1, v1, v1
; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v1, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_max_f16 v3, v4, v3
+; GFX940-NEXT: v_pk_max_f16 v3, v3, v1
; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
@@ -5377,14 +5377,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v1, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v4, v3
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -5403,13 +5403,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v1, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v3, v4, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -5428,12 +5428,12 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT: v_pk_max_f16 v3, v4, v3
+; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1
; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
@@ -5450,12 +5450,12 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v1, v1, v1
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v1, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT: v_pk_max_f16 v3, v4, v3
+; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX908-NEXT: v_pk_max_f16 v3, v3, v1
; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
@@ -5471,23 +5471,23 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v4, v6, v5
-; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v3, v3
+; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v1
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5594,36 +5594,37 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: v_dual_max_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v5, v2
-; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -5640,30 +5641,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX940-NEXT: v_max_f32_e32 v2, v6, v2
-; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX940-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5
; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -5679,38 +5680,39 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: v_dual_max_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v2, v5, v2
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -5725,33 +5727,33 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v5, v2
-; GFX10-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4
-; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB24_1
@@ -5765,29 +5767,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v2, v6, v2
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -5804,29 +5806,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_max_f32_e32 v2, v6, v2
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -5844,30 +5846,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_max_f32_e32 v2, v6, v2
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -5969,36 +5971,37 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: v_dual_max_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v5, v2
-; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6015,30 +6018,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX940-NEXT: v_max_f32_e32 v2, v6, v2
-; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX940-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX940-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5
; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6054,38 +6057,39 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: v_dual_max_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v2, v5, v2
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -6100,33 +6104,33 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v5, v2
-; GFX10-NEXT: v_max_f32_e32 v4, v6, v4
-; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4
-; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB25_1
@@ -6140,29 +6144,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v2, v6, v2
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6179,29 +6183,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_max_f32_e32 v2, v6, v2
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6219,30 +6223,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_max_f32_e32 v2, v6, v2
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6344,37 +6348,37 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v2, v0
+; GFX12-NEXT: ds_load_b32 v3, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_max_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_max_num_f32_e32 v4, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6388,36 +6392,36 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX940-LABEL: local_atomic_fmax_noret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v2, v0
+; GFX940-NEXT: ds_read_b32 v3, v0
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v6, v5
-; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX940-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
-; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5
-; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
+; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5
+; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB26_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6427,39 +6431,39 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-LABEL: local_atomic_fmax_noret_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0
+; GFX11-NEXT: ds_load_b32 v3, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_max_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_max_f32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -6472,34 +6476,34 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX10-LABEL: local_atomic_fmax_noret_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v2, v0
+; GFX10-NEXT: ds_read_b32 v3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX10-NEXT: v_max_f32_e32 v4, v6, v5
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v1
; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
@@ -6510,35 +6514,35 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX90A-LABEL: local_atomic_fmax_noret_v2bf16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v3, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6548,35 +6552,35 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX908-LABEL: local_atomic_fmax_noret_v2bf16:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v2, v0
+; GFX908-NEXT: ds_read_b32 v3, v0
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX908-NEXT: v_max_f32_e32 v4, v6, v5
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB26_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6587,36 +6591,36 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v2, v0
+; GFX8-NEXT: ds_read_b32 v3, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_max_f32_e32 v4, v6, v5
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6708,37 +6712,37 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_max_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_max_num_f32_e32 v4, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6752,36 +6756,36 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX940-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX940-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_max_f32_e32 v4, v6, v5
-; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX940-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
-; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5
-; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
+; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5
+; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB27_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6791,39 +6795,39 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_max_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_max_f32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -6836,34 +6840,34 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX10-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX10-NEXT: v_max_f32_e32 v4, v6, v5
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v1
; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB27_1
@@ -6874,35 +6878,35 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX90A-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6912,35 +6916,35 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX908-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX908-NEXT: v_max_f32_e32 v4, v6, v5
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB27_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6951,36 +6955,36 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_max_f32_e32 v4, v6, v5
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB27_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index 1c9cff7326d653..282947afa409a8 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -4520,15 +4520,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3
-; GFX12-NEXT: v_pk_min_num_f16 v2, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
+; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4550,17 +4550,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v1, v1, v1
; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX940-NEXT: v_pk_max_f16 v2, v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v2, v2, v3
-; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX940-NEXT: v_pk_min_f16 v2, v2, v1
+; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB20_1
@@ -4573,15 +4573,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v4, v3, v3
-; GFX11-NEXT: v_pk_min_f16 v2, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX11-NEXT: v_pk_min_f16 v2, v2, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -4600,14 +4600,14 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v4, v3, v3
-; GFX10-NEXT: v_pk_min_f16 v2, v4, v2
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_min_f16 v2, v2, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -4626,16 +4626,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4
-; GFX90A-NEXT: v_pk_min_f16 v2, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
@@ -4649,16 +4649,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v1, v1, v1
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX908-NEXT: v_pk_max_f16 v2, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v2, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_min_f16 v2, v2, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB20_1
@@ -4673,17 +4673,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v2, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v2, v6, v2
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
+; GFX8-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -4792,15 +4792,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3
-; GFX12-NEXT: v_pk_min_num_f16 v2, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
+; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4822,17 +4822,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v1, v1, v1
; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX940-NEXT: v_pk_max_f16 v2, v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: v_pk_max_f16 v2, v3, v3
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v2, v2, v3
-; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX940-NEXT: v_pk_min_f16 v2, v2, v1
+; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB21_1
@@ -4845,15 +4845,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v4, v3, v3
-; GFX11-NEXT: v_pk_min_f16 v2, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX11-NEXT: v_pk_min_f16 v2, v2, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -4872,14 +4872,14 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v4, v3, v3
-; GFX10-NEXT: v_pk_min_f16 v2, v4, v2
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_min_f16 v2, v2, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -4898,16 +4898,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1
; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4
-; GFX90A-NEXT: v_pk_min_f16 v2, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
@@ -4921,16 +4921,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v1, v1, v1
; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_pk_max_f16 v3, v1, v1
-; GFX908-NEXT: v_pk_max_f16 v2, v4, v4
-; GFX908-NEXT: v_pk_min_f16 v2, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_min_f16 v2, v2, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_1
@@ -4945,17 +4945,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v2, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v6, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v2, v6, v2
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
+; GFX8-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -5064,14 +5064,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
+; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v3
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5093,13 +5093,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v1, v1, v1
; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v1, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v3, v4, v3
+; GFX940-NEXT: v_pk_min_f16 v3, v3, v1
; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
@@ -5115,14 +5115,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v1, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v3, v4, v3
+; GFX11-NEXT: v_pk_min_f16 v3, v3, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -5141,13 +5141,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v1, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
-; GFX10-NEXT: v_pk_min_f16 v3, v4, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX10-NEXT: v_pk_min_f16 v3, v3, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -5166,12 +5166,12 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT: v_pk_min_f16 v3, v4, v3
+; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1
; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
@@ -5188,12 +5188,12 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v1, v1, v1
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v1, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT: v_pk_min_f16 v3, v4, v3
+; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX908-NEXT: v_pk_min_f16 v3, v3, v1
; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
@@ -5209,23 +5209,23 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v2, v0
+; GFX8-NEXT: ds_read_b32 v3, v0
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v4, v6, v5
-; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v3, v3
+; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v1
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5326,14 +5326,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2
+; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v3
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5355,13 +5355,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_pk_max_f16 v1, v1, v1
; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_pk_max_f16 v3, v1, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX940-NEXT: v_pk_max_f16 v3, v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_min_f16 v3, v4, v3
+; GFX940-NEXT: v_pk_min_f16 v3, v3, v1
; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
@@ -5377,14 +5377,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_pk_max_f16 v3, v1, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v3, v4, v3
+; GFX11-NEXT: v_pk_min_f16 v3, v3, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -5403,13 +5403,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_pk_max_f16 v3, v1, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v4, v2, v2
-; GFX10-NEXT: v_pk_min_f16 v3, v4, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX10-NEXT: v_pk_min_f16 v3, v3, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -5428,12 +5428,12 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT: v_pk_min_f16 v3, v4, v3
+; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1
; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
@@ -5450,12 +5450,12 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_pk_max_f16 v1, v1, v1
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_pk_max_f16 v3, v1, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT: v_pk_min_f16 v3, v4, v3
+; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX908-NEXT: v_pk_min_f16 v3, v3, v1
; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
@@ -5471,23 +5471,23 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v4, v6, v5
-; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v3, v3
+; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v1
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5594,36 +5594,37 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: v_dual_min_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v5, v2
-; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -5640,30 +5641,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX940-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX940-NEXT: v_min_f32_e32 v2, v6, v2
-; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX940-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5
; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -5679,38 +5680,39 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: v_dual_min_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v2, v5, v2
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -5725,33 +5727,33 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v5, v2
-; GFX10-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4
-; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB24_1
@@ -5765,29 +5767,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX90A-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX90A-NEXT: v_min_f32_e32 v2, v6, v2
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -5804,29 +5806,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_min_f32_e32 v2, v6, v2
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -5844,30 +5846,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_min_f32_e32 v2, v6, v2
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -5969,36 +5971,37 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: v_dual_min_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v5, v2
-; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6015,30 +6018,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX940-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX940-NEXT: v_min_f32_e32 v2, v6, v2
-; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX940-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX940-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5
; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6054,38 +6057,39 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: v_dual_min_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v2, v5, v2
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -6100,33 +6104,33 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v5, v2
-; GFX10-NEXT: v_min_f32_e32 v4, v6, v4
-; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4
-; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB25_1
@@ -6140,29 +6144,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX90A-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX90A-NEXT: v_min_f32_e32 v2, v6, v2
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6179,29 +6183,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_min_f32_e32 v2, v6, v2
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6219,30 +6223,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_min_f32_e32 v2, v6, v2
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6344,37 +6348,37 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v2, v0
+; GFX12-NEXT: ds_load_b32 v3, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_min_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_min_num_f32_e32 v4, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6388,36 +6392,36 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX940-LABEL: local_atomic_fmin_noret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v2, v0
+; GFX940-NEXT: ds_read_b32 v3, v0
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_min_f32_e32 v4, v6, v5
-; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX940-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX940-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
-; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5
-; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
+; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5
+; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB26_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6427,39 +6431,39 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-LABEL: local_atomic_fmin_noret_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0
+; GFX11-NEXT: ds_load_b32 v3, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_min_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_min_f32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -6472,34 +6476,34 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX10-LABEL: local_atomic_fmin_noret_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v2, v0
+; GFX10-NEXT: ds_read_b32 v3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX10-NEXT: v_min_f32_e32 v4, v6, v5
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v1
; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
@@ -6510,35 +6514,35 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX90A-LABEL: local_atomic_fmin_noret_v2bf16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v3, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_min_f32_e32 v4, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6548,35 +6552,35 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX908-LABEL: local_atomic_fmin_noret_v2bf16:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v2, v0
+; GFX908-NEXT: ds_read_b32 v3, v0
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX908-NEXT: v_min_f32_e32 v4, v6, v5
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB26_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6587,36 +6591,36 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v2, v0
+; GFX8-NEXT: ds_read_b32 v3, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_min_f32_e32 v4, v6, v5
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6708,37 +6712,37 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_min_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_min_num_f32_e32 v4, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6752,36 +6756,36 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX940-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX940-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_min_f32_e32 v4, v6, v5
-; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX940-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX940-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
-; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5
-; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
+; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5
+; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB27_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6791,39 +6795,39 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_min_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_min_f32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -6836,34 +6840,34 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX10-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX10-NEXT: v_min_f32_e32 v4, v6, v5
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v1
; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB27_1
@@ -6874,35 +6878,35 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX90A-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_min_f32_e32 v4, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6912,35 +6916,35 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX908-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX908-NEXT: v_min_f32_e32 v4, v6, v5
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB27_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6951,36 +6955,36 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_min_f32_e32 v4, v6, v5
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB27_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
index 2433eca80b23ca..1b08b64b046b48 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -6390,36 +6390,37 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v2, v5, v2
-; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6436,30 +6437,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX940-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX940-NEXT: v_sub_f32_e32 v2, v6, v2
-; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX940-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5
; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6475,38 +6476,39 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v2, v5, v2
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -6521,33 +6523,33 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT: v_sub_f32_e32 v2, v5, v2
-; GFX10-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4
-; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB24_1
@@ -6561,29 +6563,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX90A-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX90A-NEXT: v_sub_f32_e32 v2, v6, v2
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6600,29 +6602,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_sub_f32_e32 v2, v6, v2
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6640,30 +6642,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_sub_f32_e32 v2, v6, v2
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6765,36 +6767,37 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v2, v5, v2
-; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6811,30 +6814,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX940-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX940-NEXT: v_sub_f32_e32 v2, v6, v2
-; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX940-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1]
-; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1]
+; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5
; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6850,38 +6853,39 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v2, v5, v2
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -6896,33 +6900,33 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT: v_sub_f32_e32 v2, v5, v2
-; GFX10-NEXT: v_sub_f32_e32 v4, v6, v4
-; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4
-; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB25_1
@@ -6936,29 +6940,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX90A-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX90A-NEXT: v_sub_f32_e32 v2, v6, v2
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -6975,29 +6979,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX908-NEXT: v_sub_f32_e32 v2, v6, v2
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9
; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -7015,30 +7019,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_sub_f32_e32 v2, v6, v2
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -7140,37 +7144,37 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v2, v0
+; GFX12-NEXT: ds_load_b32 v3, v0
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7184,36 +7188,36 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX940-LABEL: local_atomic_fsub_noret_v2bf16:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v2, v0
+; GFX940-NEXT: ds_read_b32 v3, v0
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX940-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
-; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5
-; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
+; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5
+; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB26_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7223,39 +7227,39 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-LABEL: local_atomic_fsub_noret_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0
+; GFX11-NEXT: ds_load_b32 v3, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -7268,34 +7272,34 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX10-LABEL: local_atomic_fsub_noret_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v2, v0
+; GFX10-NEXT: ds_read_b32 v3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX10-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1
; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
@@ -7306,35 +7310,35 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX90A-LABEL: local_atomic_fsub_noret_v2bf16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v3, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7344,35 +7348,35 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX908-LABEL: local_atomic_fsub_noret_v2bf16:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v2, v0
+; GFX908-NEXT: ds_read_b32 v3, v0
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX908-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB26_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7383,36 +7387,36 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v2, v0
+; GFX8-NEXT: ds_read_b32 v3, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7504,37 +7508,37 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX12-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7548,36 +7552,36 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX940-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX940-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX940-NEXT: s_movk_i32 s4, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s5, 0x7060302
; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX940-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX940-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
-; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5
-; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
+; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5
+; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX940-NEXT: s_cbranch_execnz .LBB27_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7587,39 +7591,39 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX11-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v4
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -7632,34 +7636,34 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX10-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX10-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1
; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB27_1
@@ -7670,35 +7674,35 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX90A-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7708,35 +7712,35 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX908-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX908-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX908-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8
-; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB27_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7747,36 +7751,36 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_sub_f32_e32 v4, v6, v5
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB27_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
index be8a6295c8a71c..8157b1a7f7c802 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
@@ -161,13 +161,13 @@ define void @issue63986_reduced_expanded(i64 %idxprom) {
; CHECK-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; CHECK-NEXT: s_cbranch_execnz .LBB1_8
; CHECK-NEXT: .LBB1_5: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: s_mov_b64 s[6:7], 0
-; CHECK-NEXT: .LBB1_6: ; %loop-memcpy-residual
-; CHECK-NEXT: s_add_u32 s6, s6, 1
; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: s_mov_b64 s[6:7], 0
; CHECK-NEXT: v_mov_b32_e32 v1, s5
-; CHECK-NEXT: s_addc_u32 s7, s7, 0
-; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; CHECK-NEXT: .LBB1_6: ; %loop-memcpy-residual
+; CHECK-NEXT: s_add_u32 s4, s6, 1
+; CHECK-NEXT: s_addc_u32 s5, s7, 0
+; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; CHECK-NEXT: s_mov_b64 s[6:7], 1
; CHECK-NEXT: s_cbranch_vccnz .LBB1_6
; CHECK-NEXT: ; %bb.7: ; %Flow
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
index 86a6ad46e87683..a9b8663a48dea0 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -5,21 +5,23 @@
define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
; GFX942-LABEL: matmul_kernel:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NEXT: v_mov_b32_e32 v1, 0
-; GFX942-NEXT: s_mov_b32 s0, 0
+; GFX942-NEXT: s_mov_b32 s2, 0
; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
-; GFX942-NEXT: s_mov_b32 s1, 0
+; GFX942-NEXT: s_mov_b32 s3, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_cmp_lg_u32 s2, 0
-; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
; GFX942-NEXT: s_branch .LBB0_2
; GFX942-NEXT: .LBB0_1: ; %bb2
; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1
-; GFX942-NEXT: s_or_b32 s4, s1, 1
-; GFX942-NEXT: s_ashr_i32 s5, s1, 31
-; GFX942-NEXT: s_mov_b32 s1, s0
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-NEXT: s_or_b32 s4, s3, 1
+; GFX942-NEXT: s_ashr_i32 s5, s3, 31
+; GFX942-NEXT: s_mov_b32 s3, s2
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_mov_b32_e32 v3, v1
@@ -27,54 +29,56 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
; GFX942-NEXT: v_accvgpr_write_b32 a1, v1
; GFX942-NEXT: v_accvgpr_write_b32 a2, v2
; GFX942-NEXT: v_accvgpr_write_b32 a3, v3
-; GFX942-NEXT: s_and_b32 s1, s5, s4
+; GFX942-NEXT: s_and_b32 s3, s5, s4
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[4:5], a[0:3]
; GFX942-NEXT: s_cbranch_execz .LBB0_4
; GFX942-NEXT: .LBB0_2: ; %bb
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX942-NEXT: s_cbranch_vccz .LBB0_1
; GFX942-NEXT: ; %bb.3:
-; GFX942-NEXT: ; implicit-def: $sgpr1
+; GFX942-NEXT: ; implicit-def: $sgpr3
; GFX942-NEXT: .LBB0_4: ; %common.ret
; GFX942-NEXT: s_endpgm
;
; GFX908-LABEL: matmul_kernel:
; GFX908: ; %bb.0: ; %entry
-; GFX908-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX908-NEXT: v_mov_b32_e32 v1, 0
-; GFX908-NEXT: s_mov_b32 s0, 0
-; GFX908-NEXT: s_mov_b32 s1, 0
+; GFX908-NEXT: s_mov_b32 s2, 0
+; GFX908-NEXT: s_mov_b32 s3, 0
; GFX908-NEXT: v_accvgpr_write_b32 a0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_cmp_lg_u32 s2, 0
-; GFX908-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
; GFX908-NEXT: s_branch .LBB0_2
; GFX908-NEXT: .LBB0_1: ; %bb2
; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1
-; GFX908-NEXT: s_or_b32 s4, s1, 1
-; GFX908-NEXT: s_ashr_i32 s5, s1, 31
-; GFX908-NEXT: s_mov_b32 s1, s0
+; GFX908-NEXT: s_or_b32 s4, s3, 1
+; GFX908-NEXT: s_ashr_i32 s5, s3, 31
+; GFX908-NEXT: s_mov_b32 s3, s2
; GFX908-NEXT: s_nop 3
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_mov_b32_e32 v5, s1
-; GFX908-NEXT: v_mov_b32_e32 v4, s0
+; GFX908-NEXT: v_mov_b32_e32 v5, s3
+; GFX908-NEXT: v_mov_b32_e32 v4, s2
; GFX908-NEXT: v_mov_b32_e32 v2, v1
; GFX908-NEXT: v_mov_b32_e32 v3, v1
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
-; GFX908-NEXT: s_and_b32 s1, s5, s4
+; GFX908-NEXT: s_and_b32 s3, s5, s4
; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[4:5], v[4:5], a[0:3]
; GFX908-NEXT: s_cbranch_execz .LBB0_4
; GFX908-NEXT: .LBB0_2: ; %bb
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX908-NEXT: s_cbranch_vccz .LBB0_1
; GFX908-NEXT: ; %bb.3:
-; GFX908-NEXT: ; implicit-def: $sgpr1
+; GFX908-NEXT: ; implicit-def: $sgpr3
; GFX908-NEXT: .LBB0_4: ; %common.ret
; GFX908-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index c80254210109a7..3e45a2d0df43d6 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -23,8 +23,10 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
; GCN-NEXT: ; Child Loop BB0_4 Depth 2
; GCN-NEXT: buffer_load_dword v1, off, s[8:11], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v1
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1
; GCN-NEXT: s_mov_b32 s12, s6
; GCN-NEXT: s_branch .LBB0_4
; GCN-NEXT: .LBB0_3: ; %Flow1
@@ -34,7 +36,7 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
; GCN-NEXT: .LBB0_4: ; %bb2
; GCN-NEXT: ; Parent Loop BB0_2 Depth=1
; GCN-NEXT: ; => This Inner Loop Header: Depth=2
-; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; GCN-NEXT: s_and_b64 vcc, exec, s[0:1]
; GCN-NEXT: s_lshl_b32 s12, s12, 5
; GCN-NEXT: s_cbranch_vccz .LBB0_6
; GCN-NEXT: ; %bb.5: ; in Loop: Header=BB0_4 Depth=2
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index cb9f6c1f38eea7..96dd6276f7e382 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1744,8 +1744,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v5, s[4:5], 0, -1, vcc
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v3, 0
; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
+; GCN-IR-NEXT: v_mov_b32_e32 v3, 0
; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index c1cc61bbacd0fd..23364e860d1542 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1865,8 +1865,8 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 15dc5451750553..12eec4fa3bd594 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -32,51 +32,71 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-LABEL: kernel:
; GLOBALNESS1: ; %bb.0: ; %bb
; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7]
-; GLOBALNESS1-NEXT: s_load_dwordx4 s[64:67], s[8:9], 0x0
+; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0
; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14
; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0
; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[64:65]
-; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17
-; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0
+; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[76:77]
; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5]
; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18
; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20
-; GLOBALNESS1-NEXT: s_bitcmp1_b32 s66, 0
-; GLOBALNESS1-NEXT: s_cselect_b64 s[68:69], -1, 0
+; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
-; GLOBALNESS1-NEXT: s_xor_b64 s[70:71], s[68:69], -1
+; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400
-; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0
+; GLOBALNESS1-NEXT: s_bitcmp1_b32 s78, 0
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], v[0:1]
-; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[64:65], s[4:5], 0
+; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
+; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
+; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1
+; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0
+; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT: s_xor_b64 s[72:73], s[4:5], -1
+; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0
+; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT: s_mov_b32 s60, s16
+; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0
+; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3
+; GLOBALNESS1-NEXT: s_mov_b32 s70, s16
; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9]
-; GLOBALNESS1-NEXT: s_mov_b32 s61, s15
-; GLOBALNESS1-NEXT: s_mov_b32 s62, s14
+; GLOBALNESS1-NEXT: s_mov_b32 s71, s15
+; GLOBALNESS1-NEXT: s_mov_b32 s72, s14
; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11]
-; GLOBALNESS1-NEXT: s_xor_b64 s[74:75], s[4:5], -1
; GLOBALNESS1-NEXT: s_mov_b32 s32, 0
; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
-; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[44:45], 0, v2
-; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[46:47], 1, v2
-; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v2
-; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[50:51], 0, v2
+; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
+; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2
+; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2
; GLOBALNESS1-NEXT: s_branch .LBB1_4
; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[50:51]
+; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[60:61]
; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29
; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
@@ -105,23 +125,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT: s_mov_b32 s12, s62
-; GLOBALNESS1-NEXT: s_mov_b32 s13, s61
-; GLOBALNESS1-NEXT: s_mov_b32 s14, s60
+; GLOBALNESS1-NEXT: s_mov_b32 s12, s72
+; GLOBALNESS1-NEXT: s_mov_b32 s13, s71
+; GLOBALNESS1-NEXT: s_mov_b32 s14, s70
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[68:69]
+; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47]
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1
; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9
; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: s_cmp_lt_i32 s67, 1
+; GLOBALNESS1-NEXT: s_cmp_lt_i32 s79, 1
; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7
; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock12
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: s_cmp_lg_u32 s67, 1
+; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 1
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1
; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_8
@@ -131,7 +151,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5
; GLOBALNESS1-NEXT: .LBB1_8: ; %LeafBlock
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: s_cmp_lg_u32 s67, 0
+; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 0
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0
; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0
; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25
@@ -143,15 +163,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3]
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[52:53], 0, v0
+; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000
-; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[76:77], s[52:53]
+; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[74:75], s[62:63]
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26
; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
-; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[44:45]
+; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
@@ -163,70 +183,72 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46
; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
-; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[54:55], 0, v[0:1]
-; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[56:57], 0, v2
+; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
+; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
; GLOBALNESS1-NEXT: s_branch .LBB1_16
; GLOBALNESS1-NEXT: .LBB1_14: ; %Flow16
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5]
; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[74:75]
+; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53]
; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25
; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i
; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1
; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2
-; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[70:71]
+; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[48:49]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15
; GLOBALNESS1-NEXT: ; %bb.17: ; %bb46.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[72:73]
+; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15
; GLOBALNESS1-NEXT: ; %bb.18: ; %bb50.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[42:43]
+; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[64:65]
+; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[44:45]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[54:55]
+; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67]
; GLOBALNESS1-NEXT: .LBB1_21: ; %spam.exit.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[46:47]
+; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15
; GLOBALNESS1-NEXT: ; %bb.22: ; %bb55.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT: s_add_u32 s58, s38, 40
-; GLOBALNESS1-NEXT: s_addc_u32 s59, s39, 0
+; GLOBALNESS1-NEXT: s_add_u32 s68, s38, 40
+; GLOBALNESS1-NEXT: s_addc_u32 s69, s39, 0
; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
; GLOBALNESS1-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
-; GLOBALNESS1-NEXT: s_load_dwordx2 s[78:79], s[4:5], 0x0
+; GLOBALNESS1-NEXT: s_load_dwordx2 s[76:77], s[4:5], 0x0
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[58:59]
+; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69]
; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT: s_mov_b32 s12, s62
-; GLOBALNESS1-NEXT: s_mov_b32 s13, s61
-; GLOBALNESS1-NEXT: s_mov_b32 s14, s60
+; GLOBALNESS1-NEXT: s_mov_b32 s12, s72
+; GLOBALNESS1-NEXT: s_mov_b32 s13, s71
+; GLOBALNESS1-NEXT: s_mov_b32 s14, s70
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[78:79]
+; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77]
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[58:59]
+; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69]
; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT: s_mov_b32 s12, s62
-; GLOBALNESS1-NEXT: s_mov_b32 s13, s61
-; GLOBALNESS1-NEXT: s_mov_b32 s14, s60
+; GLOBALNESS1-NEXT: s_mov_b32 s12, s72
+; GLOBALNESS1-NEXT: s_mov_b32 s13, s71
+; GLOBALNESS1-NEXT: s_mov_b32 s14, s70
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
-; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[78:79]
-; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[56:57]
+; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77]
+; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65]
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14
; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
@@ -242,12 +264,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0
; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow24
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[76:77]
-; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[52:53]
+; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[74:75]
+; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[62:63]
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[48:49]
+; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
@@ -271,9 +293,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT: s_mov_b32 s12, s62
-; GLOBALNESS1-NEXT: s_mov_b32 s13, s61
-; GLOBALNESS1-NEXT: s_mov_b32 s14, s60
+; GLOBALNESS1-NEXT: s_mov_b32 s12, s72
+; GLOBALNESS1-NEXT: s_mov_b32 s13, s71
+; GLOBALNESS1-NEXT: s_mov_b32 s14, s70
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17]
; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4
@@ -289,9 +311,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT: s_mov_b32 s12, s62
-; GLOBALNESS1-NEXT: s_mov_b32 s13, s61
-; GLOBALNESS1-NEXT: s_mov_b32 s14, s60
+; GLOBALNESS1-NEXT: s_mov_b32 s12, s72
+; GLOBALNESS1-NEXT: s_mov_b32 s13, s71
+; GLOBALNESS1-NEXT: s_mov_b32 s14, s70
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17]
; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4
@@ -302,51 +324,71 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-LABEL: kernel:
; GLOBALNESS0: ; %bb.0: ; %bb
; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7]
-; GLOBALNESS0-NEXT: s_load_dwordx4 s[64:67], s[8:9], 0x0
+; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0
; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14
; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0
; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[64:65]
-; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17
-; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0
+; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[72:73]
; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5]
; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18
; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20
-; GLOBALNESS0-NEXT: s_bitcmp1_b32 s66, 0
-; GLOBALNESS0-NEXT: s_cselect_b64 s[68:69], -1, 0
+; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
-; GLOBALNESS0-NEXT: s_xor_b64 s[70:71], s[68:69], -1
+; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400
-; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0
+; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], v[0:1]
-; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[64:65], s[4:5], 0
+; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
+; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
+; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1
+; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0
+; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT: s_xor_b64 s[72:73], s[4:5], -1
+; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0
+; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT: s_mov_b32 s58, s16
+; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0
+; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3
+; GLOBALNESS0-NEXT: s_mov_b32 s68, s16
; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9]
-; GLOBALNESS0-NEXT: s_mov_b32 s59, s15
-; GLOBALNESS0-NEXT: s_mov_b32 s60, s14
+; GLOBALNESS0-NEXT: s_mov_b32 s69, s15
+; GLOBALNESS0-NEXT: s_mov_b32 s70, s14
; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11]
-; GLOBALNESS0-NEXT: s_xor_b64 s[74:75], s[4:5], -1
; GLOBALNESS0-NEXT: s_mov_b32 s32, 0
; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
-; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[44:45], 0, v2
-; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[46:47], 1, v2
-; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v2
-; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[50:51], 0, v2
+; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
+; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2
+; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2
; GLOBALNESS0-NEXT: s_branch .LBB1_4
; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[50:51]
+; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[60:61]
; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29
; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
@@ -375,23 +417,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT: s_mov_b32 s12, s60
-; GLOBALNESS0-NEXT: s_mov_b32 s13, s59
-; GLOBALNESS0-NEXT: s_mov_b32 s14, s58
+; GLOBALNESS0-NEXT: s_mov_b32 s12, s70
+; GLOBALNESS0-NEXT: s_mov_b32 s13, s69
+; GLOBALNESS0-NEXT: s_mov_b32 s14, s68
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[68:69]
+; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47]
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1
; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9
; GLOBALNESS0-NEXT: ; %bb.5: ; %NodeBlock
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: s_cmp_lt_i32 s67, 1
+; GLOBALNESS0-NEXT: s_cmp_lt_i32 s75, 1
; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7
; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock12
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: s_cmp_lg_u32 s67, 1
+; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 1
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1
; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_8
@@ -401,7 +443,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GLOBALNESS0-NEXT: .LBB1_8: ; %LeafBlock
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: s_cmp_lg_u32 s67, 0
+; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 0
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0
; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0
; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow25
@@ -413,15 +455,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3]
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[52:53], 0, v0
+; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000
-; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[76:77], s[52:53]
+; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[76:77], s[62:63]
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26
; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
-; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[44:45]
+; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
@@ -433,70 +475,72 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46
; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
-; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[54:55], 0, v[0:1]
-; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[56:57], 0, v2
+; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
+; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
; GLOBALNESS0-NEXT: s_branch .LBB1_16
; GLOBALNESS0-NEXT: .LBB1_14: ; %Flow16
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5]
; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[74:75]
+; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53]
; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25
; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i
; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1
; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2
-; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[70:71]
+; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[48:49]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15
; GLOBALNESS0-NEXT: ; %bb.17: ; %bb46.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[72:73]
+; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15
; GLOBALNESS0-NEXT: ; %bb.18: ; %bb50.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[42:43]
+; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[64:65]
+; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[44:45]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[54:55]
+; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67]
; GLOBALNESS0-NEXT: .LBB1_21: ; %spam.exit.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[46:47]
+; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15
; GLOBALNESS0-NEXT: ; %bb.22: ; %bb55.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT: s_add_u32 s62, s38, 40
-; GLOBALNESS0-NEXT: s_addc_u32 s63, s39, 0
+; GLOBALNESS0-NEXT: s_add_u32 s72, s38, 40
+; GLOBALNESS0-NEXT: s_addc_u32 s73, s39, 0
; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
; GLOBALNESS0-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
; GLOBALNESS0-NEXT: s_load_dwordx2 s[78:79], s[4:5], 0x0
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[62:63]
+; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73]
; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT: s_mov_b32 s12, s60
-; GLOBALNESS0-NEXT: s_mov_b32 s13, s59
-; GLOBALNESS0-NEXT: s_mov_b32 s14, s58
+; GLOBALNESS0-NEXT: s_mov_b32 s12, s70
+; GLOBALNESS0-NEXT: s_mov_b32 s13, s69
+; GLOBALNESS0-NEXT: s_mov_b32 s14, s68
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79]
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37]
-; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[62:63]
+; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73]
; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT: s_mov_b32 s12, s60
-; GLOBALNESS0-NEXT: s_mov_b32 s13, s59
-; GLOBALNESS0-NEXT: s_mov_b32 s14, s58
+; GLOBALNESS0-NEXT: s_mov_b32 s12, s70
+; GLOBALNESS0-NEXT: s_mov_b32 s13, s69
+; GLOBALNESS0-NEXT: s_mov_b32 s14, s68
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79]
-; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[56:57]
+; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65]
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14
; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
@@ -513,11 +557,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[76:77]
-; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[52:53]
+; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[62:63]
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[48:49]
+; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
@@ -541,9 +585,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT: s_mov_b32 s12, s60
-; GLOBALNESS0-NEXT: s_mov_b32 s13, s59
-; GLOBALNESS0-NEXT: s_mov_b32 s14, s58
+; GLOBALNESS0-NEXT: s_mov_b32 s12, s70
+; GLOBALNESS0-NEXT: s_mov_b32 s13, s69
+; GLOBALNESS0-NEXT: s_mov_b32 s14, s68
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17]
; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4
@@ -559,9 +603,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT: s_mov_b32 s12, s60
-; GLOBALNESS0-NEXT: s_mov_b32 s13, s59
-; GLOBALNESS0-NEXT: s_mov_b32 s14, s58
+; GLOBALNESS0-NEXT: s_mov_b32 s12, s70
+; GLOBALNESS0-NEXT: s_mov_b32 s13, s69
+; GLOBALNESS0-NEXT: s_mov_b32 s14, s68
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17]
; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 089f3d255e87a6..db7d816386a70d 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -1187,8 +1187,8 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
index 945b87b6e1628d..0acee5bd5ac19d 100644
--- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -52,10 +52,13 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 {
; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: s_waitcnt expcnt(1)
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1
; CHECK-NEXT: .LBB1_1: ; %bb9
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT: s_cbranch_vccnz .LBB1_1
; CHECK-NEXT: ; %bb.2: ; %bb11
; CHECK-NEXT: s_mov_b32 s3, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index bc7b375cd404d9..a794d139063d5f 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -1294,8 +1294,8 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
index 41d324dd7abfc7..0211c5111c31dd 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -6,15 +6,15 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 {
; GCN-LABEL: vgpr_descriptor_waterfall_loop_idom_update:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8
+; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GCN-NEXT: .LBB0_1: ; %bb0
; GCN-NEXT: ; =>This Loop Header: Depth=1
; GCN-NEXT: ; Child Loop BB0_2 Depth 2
-; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8
-; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
-; GCN-NEXT: s_mov_b32 s5, exec_lo
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7]
; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT: s_mov_b32 s5, exec_lo
; GCN-NEXT: .LBB0_2: ; Parent Loop BB0_1 Depth=1
; GCN-NEXT: ; => This Inner Loop Header: Depth=2
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/LoongArch/jr-without-ra.ll b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll
index 2bd89dacb2b37a..d1c4459aaa6ee0 100644
--- a/llvm/test/CodeGen/LoongArch/jr-without-ra.ll
+++ b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll
@@ -20,101 +20,101 @@ define void @jr_without_ra(ptr %rtwdev, ptr %chan, ptr %h2c, i8 %.pre, i1 %cmp.i
; CHECK-NEXT: st.d $s6, $sp, 24 # 8-byte Folded Spill
; CHECK-NEXT: st.d $s7, $sp, 16 # 8-byte Folded Spill
; CHECK-NEXT: st.d $s8, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT: move $s6, $zero
-; CHECK-NEXT: move $s1, $zero
+; CHECK-NEXT: move $s7, $zero
+; CHECK-NEXT: move $s0, $zero
; CHECK-NEXT: ld.d $t0, $sp, 184
-; CHECK-NEXT: ld.d $t1, $sp, 176
-; CHECK-NEXT: ld.d $s2, $sp, 168
-; CHECK-NEXT: ld.d $t2, $sp, 160
-; CHECK-NEXT: ld.d $t3, $sp, 152
-; CHECK-NEXT: ld.d $t4, $sp, 144
-; CHECK-NEXT: ld.d $t5, $sp, 136
-; CHECK-NEXT: ld.d $t6, $sp, 128
-; CHECK-NEXT: ld.d $t7, $sp, 120
-; CHECK-NEXT: ld.d $t8, $sp, 112
-; CHECK-NEXT: ld.d $fp, $sp, 104
-; CHECK-NEXT: ld.d $s0, $sp, 96
+; CHECK-NEXT: ld.d $s2, $sp, 176
+; CHECK-NEXT: ld.d $s1, $sp, 168
+; CHECK-NEXT: ld.d $t1, $sp, 160
+; CHECK-NEXT: ld.d $t2, $sp, 152
+; CHECK-NEXT: ld.d $t3, $sp, 144
+; CHECK-NEXT: ld.d $t4, $sp, 136
+; CHECK-NEXT: ld.d $t5, $sp, 128
+; CHECK-NEXT: ld.d $t6, $sp, 120
+; CHECK-NEXT: ld.d $t7, $sp, 112
+; CHECK-NEXT: ld.d $t8, $sp, 104
+; CHECK-NEXT: ld.d $fp, $sp, 96
; CHECK-NEXT: andi $a4, $a4, 1
-; CHECK-NEXT: alsl.d $a6, $a6, $s2, 4
-; CHECK-NEXT: pcalau12i $s2, %pc_hi20(.LJTI0_0)
-; CHECK-NEXT: addi.d $s2, $s2, %pc_lo12(.LJTI0_0)
+; CHECK-NEXT: alsl.d $a6, $a6, $s1, 4
+; CHECK-NEXT: pcalau12i $s1, %pc_hi20(.LJTI0_0)
+; CHECK-NEXT: addi.d $s1, $s1, %pc_lo12(.LJTI0_0)
+; CHECK-NEXT: slli.d $s3, $s2, 2
+; CHECK-NEXT: alsl.d $s2, $s2, $s3, 1
+; CHECK-NEXT: add.d $s2, $t5, $s2
+; CHECK-NEXT: addi.w $s4, $zero, -41
; CHECK-NEXT: ori $s3, $zero, 1
-; CHECK-NEXT: ori $s4, $zero, 50
-; CHECK-NEXT: ori $s5, $zero, 3
-; CHECK-NEXT: lu32i.d $s5, 262144
+; CHECK-NEXT: slli.d $s4, $s4, 3
+; CHECK-NEXT: ori $s6, $zero, 3
+; CHECK-NEXT: lu32i.d $s6, 262144
; CHECK-NEXT: b .LBB0_4
; CHECK-NEXT: .p2align 4, , 16
; CHECK-NEXT: .LBB0_1: # %sw.bb27.i.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: ori $s7, $zero, 1
+; CHECK-NEXT: ori $s8, $zero, 1
; CHECK-NEXT: .LBB0_2: # %if.else.i106
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: alsl.d $s8, $s1, $s1, 3
-; CHECK-NEXT: alsl.d $s1, $s8, $s1, 1
-; CHECK-NEXT: add.d $s1, $t0, $s1
-; CHECK-NEXT: ldx.bu $s7, $s1, $s7
+; CHECK-NEXT: alsl.d $s5, $s0, $s0, 3
+; CHECK-NEXT: alsl.d $s0, $s5, $s0, 1
+; CHECK-NEXT: add.d $s0, $t0, $s0
+; CHECK-NEXT: ldx.bu $s8, $s0, $s8
; CHECK-NEXT: .LBB0_3: # %phy_tssi_get_ofdm_de.exit
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: st.b $zero, $t6, 0
-; CHECK-NEXT: st.b $s6, $t4, 0
-; CHECK-NEXT: st.b $zero, $fp, 0
-; CHECK-NEXT: st.b $zero, $t2, 0
+; CHECK-NEXT: st.b $zero, $t5, 0
+; CHECK-NEXT: st.b $s7, $t3, 0
+; CHECK-NEXT: st.b $zero, $t8, 0
+; CHECK-NEXT: st.b $zero, $t1, 0
; CHECK-NEXT: st.b $zero, $a1, 0
-; CHECK-NEXT: st.b $zero, $t3, 0
-; CHECK-NEXT: st.b $s7, $a5, 0
-; CHECK-NEXT: ori $s1, $zero, 1
-; CHECK-NEXT: move $s6, $a3
+; CHECK-NEXT: st.b $zero, $t2, 0
+; CHECK-NEXT: st.b $s8, $a5, 0
+; CHECK-NEXT: ori $s0, $zero, 1
+; CHECK-NEXT: move $s7, $a3
; CHECK-NEXT: .LBB0_4: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: beqz $a4, .LBB0_9
; CHECK-NEXT: # %bb.5: # %calc_6g.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: move $s6, $zero
+; CHECK-NEXT: move $s7, $zero
; CHECK-NEXT: bnez $zero, .LBB0_8
; CHECK-NEXT: # %bb.6: # %calc_6g.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: slli.d $s7, $zero, 3
-; CHECK-NEXT: ldx.d $s7, $s7, $s2
-; CHECK-NEXT: jr $s7
+; CHECK-NEXT: slli.d $s8, $zero, 3
+; CHECK-NEXT: ldx.d $s8, $s8, $s1
+; CHECK-NEXT: jr $s8
; CHECK-NEXT: .LBB0_7: # %sw.bb12.i.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: ori $s6, $zero, 1
+; CHECK-NEXT: ori $s7, $zero, 1
; CHECK-NEXT: .LBB0_8: # %if.else58.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: ldx.bu $s6, $a6, $s6
+; CHECK-NEXT: ldx.bu $s7, $a6, $s7
; CHECK-NEXT: b .LBB0_11
; CHECK-NEXT: .p2align 4, , 16
; CHECK-NEXT: .LBB0_9: # %if.end.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: andi $s6, $s6, 255
-; CHECK-NEXT: bltu $s4, $s6, .LBB0_15
+; CHECK-NEXT: andi $s7, $s7, 255
+; CHECK-NEXT: ori $s5, $zero, 50
+; CHECK-NEXT: bltu $s5, $s7, .LBB0_15
; CHECK-NEXT: # %bb.10: # %if.end.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: sll.d $s6, $s3, $s6
-; CHECK-NEXT: and $s7, $s6, $s5
-; CHECK-NEXT: move $s6, $s0
-; CHECK-NEXT: beqz $s7, .LBB0_15
+; CHECK-NEXT: sll.d $s7, $s3, $s7
+; CHECK-NEXT: and $s8, $s7, $s6
+; CHECK-NEXT: move $s7, $fp
+; CHECK-NEXT: beqz $s8, .LBB0_15
; CHECK-NEXT: .LBB0_11: # %phy_tssi_get_ofdm_trim_de.exit
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: move $s7, $zero
-; CHECK-NEXT: st.b $zero, $t8, 0
-; CHECK-NEXT: slli.d $s8, $t1, 2
-; CHECK-NEXT: alsl.d $s8, $t1, $s8, 1
-; CHECK-NEXT: add.d $s8, $t6, $s8
-; CHECK-NEXT: ldx.b $s8, $s8, $t5
+; CHECK-NEXT: move $s8, $zero
+; CHECK-NEXT: st.b $zero, $t7, 0
+; CHECK-NEXT: ldx.b $ra, $s2, $t4
; CHECK-NEXT: st.b $zero, $a2, 0
; CHECK-NEXT: st.b $zero, $a7, 0
-; CHECK-NEXT: st.b $zero, $t7, 0
-; CHECK-NEXT: st.b $s8, $a0, 0
+; CHECK-NEXT: st.b $zero, $t6, 0
+; CHECK-NEXT: st.b $ra, $a0, 0
; CHECK-NEXT: bnez $s3, .LBB0_13
; CHECK-NEXT: # %bb.12: # %phy_tssi_get_ofdm_trim_de.exit
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: addi.w $s8, $zero, -41
-; CHECK-NEXT: slli.d $s8, $s8, 3
; CHECK-NEXT: pcalau12i $ra, %pc_hi20(.LJTI0_1)
; CHECK-NEXT: addi.d $ra, $ra, %pc_lo12(.LJTI0_1)
-; CHECK-NEXT: ldx.d $s8, $s8, $ra
-; CHECK-NEXT: jr $s8
+; CHECK-NEXT: ldx.d $s5, $s4, $ra
+; CHECK-NEXT: jr $s5
; CHECK-NEXT: .LBB0_13: # %phy_tssi_get_ofdm_trim_de.exit
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
; CHECK-NEXT: bnez $s3, .LBB0_1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index ec2448cb3965f3..c35f05be304cce 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -489,9 +489,8 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64-NEXT: j .LBB0_11
; RV64-NEXT: .LBB0_8: # %vector.ph
; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1
-; RV64-NEXT: slli t6, t0, 1
-; RV64-NEXT: slli s0, t0, 28
-; RV64-NEXT: sub t6, s0, t6
+; RV64-NEXT: slli t6, t0, 28
+; RV64-NEXT: sub t6, t6, t1
; RV64-NEXT: and t6, t6, a6
; RV64-NEXT: csrwi vxrm, 0
; RV64-NEXT: mv s0, a2
diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
index 6d082802f9cd75..d076cb00ad7e0e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
@@ -353,8 +353,8 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: mov lr, r0
@@ -364,48 +364,50 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ %bb.1: @ %for.cond2.preheader.lr.ph
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: csel r3, r2, r0, lt
+; CHECK-NEXT: csel r7, r2, r0, lt
; CHECK-NEXT: mov r12, r1
-; CHECK-NEXT: mov r1, r3
-; CHECK-NEXT: cmp r3, #3
+; CHECK-NEXT: mov r1, r7
+; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: it ls
; CHECK-NEXT: movls r1, #3
; CHECK-NEXT: mov r4, r2
-; CHECK-NEXT: subs r1, r1, r3
+; CHECK-NEXT: subs r1, r1, r7
; CHECK-NEXT: movw r2, #43691
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: movt r2, #43690
-; CHECK-NEXT: ldr r6, [sp, #112]
-; CHECK-NEXT: movw r9, :lower16:c
+; CHECK-NEXT: ldr r6, [sp, #128]
+; CHECK-NEXT: movw r8, :lower16:c
; CHECK-NEXT: umull r1, r2, r1, r2
-; CHECK-NEXT: adr.w r8, .LCPI1_1
+; CHECK-NEXT: movt r8, :upper16:c
; CHECK-NEXT: movs r1, #4
+; CHECK-NEXT: @ implicit-def: $r10
; CHECK-NEXT: @ implicit-def: $r5
; CHECK-NEXT: @ implicit-def: $r11
-; CHECK-NEXT: @ implicit-def: $r7
-; CHECK-NEXT: movt r9, :upper16:c
-; CHECK-NEXT: mov.w r10, #12
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: mov.w r9, #12
+; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: add.w r1, r1, r2, lsr #1
; CHECK-NEXT: add.w r0, r0, r2, lsr #1
-; CHECK-NEXT: bic r2, r1, #3
+; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: adr r1, .LCPI1_0
; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vdup.32 q5, r0
+; CHECK-NEXT: adr r1, .LCPI1_1
+; CHECK-NEXT: vldrw.u32 q5, [r1]
; CHECK-NEXT: vdup.32 q6, r0
-; CHECK-NEXT: strd r2, r4, [sp, #4] @ 8-byte Folded Spill
-; CHECK-NEXT: vadd.i32 q4, q0, r3
+; CHECK-NEXT: vadd.i32 q4, q0, r7
+; CHECK-NEXT: vdup.32 q7, r0
+; CHECK-NEXT: strd r3, r7, [sp, #4] @ 8-byte Folded Spill
; CHECK-NEXT: b .LBB1_6
; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: cmn.w r7, #4
+; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: cmn.w r11, #4
; CHECK-NEXT: it le
; CHECK-NEXT: mvnle r0, #3
; CHECK-NEXT: movw r2, #18725
; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: movt r2, #9362
-; CHECK-NEXT: subs r1, r0, r7
+; CHECK-NEXT: sub.w r1, r0, r11
+; CHECK-NEXT: mov r10, r3
; CHECK-NEXT: umull r2, r3, r1, r2
; CHECK-NEXT: subs r2, r1, r3
; CHECK-NEXT: add.w r2, r3, r2, lsr #1
@@ -413,18 +415,19 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: lsls r3, r3, #3
; CHECK-NEXT: sub.w r2, r3, r2, lsr #2
; CHECK-NEXT: subs r1, r2, r1
+; CHECK-NEXT: mov r3, r10
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5.loopexit134.split.loop.exit139
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: adds r7, r0, #7
+; CHECK-NEXT: add.w r11, r0, #7
; CHECK-NEXT: .LBB1_4: @ %for.cond.cleanup5
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: movs r5, #0
+; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup5
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: add.w r11, r11, #2
-; CHECK-NEXT: subs.w r1, r11, lr
-; CHECK-NEXT: asr.w r0, r11, #31
+; CHECK-NEXT: adds r5, #2
+; CHECK-NEXT: subs.w r1, r5, lr
+; CHECK-NEXT: asr.w r0, r5, #31
; CHECK-NEXT: sbcs.w r0, r0, r12
; CHECK-NEXT: bge.w .LBB1_28
; CHECK-NEXT: .LBB1_6: @ %for.cond2.preheader
@@ -433,35 +436,36 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ Child Loop BB1_10 Depth 2
; CHECK-NEXT: @ Child Loop BB1_12 Depth 3
; CHECK-NEXT: @ Child Loop BB1_14 Depth 3
-; CHECK-NEXT: cmp r7, #2
+; CHECK-NEXT: cmp.w r11, #2
; CHECK-NEXT: bgt .LBB1_5
; CHECK-NEXT: @ %bb.7: @ %for.body6.lr.ph
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: cmp r0, #5
+; CHECK-NEXT: cmp r7, #5
; CHECK-NEXT: bhi .LBB1_17
; CHECK-NEXT: @ %bb.8: @ %for.body6.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: ldrd r2, r3, [sp, #104]
+; CHECK-NEXT: ldrd r2, r3, [sp, #120]
; CHECK-NEXT: movs r0, #32
; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: mov r6, r12
-; CHECK-NEXT: mov r4, lr
+; CHECK-NEXT: mov r4, r6
+; CHECK-NEXT: mov r7, r12
+; CHECK-NEXT: mov r6, lr
; CHECK-NEXT: bl __aeabi_ldivmod
-; CHECK-NEXT: mov lr, r4
-; CHECK-NEXT: mov r12, r6
-; CHECK-NEXT: ldr r4, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: vdup.32 q0, r2
-; CHECK-NEXT: ldr r6, [sp, #112]
-; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: mov lr, r6
+; CHECK-NEXT: mov r6, r4
+; CHECK-NEXT: mov r12, r7
; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: vdup.32 q0, r2
+; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: mov r0, r11
; CHECK-NEXT: b .LBB1_10
; CHECK-NEXT: .LBB1_9: @ %for.cond.cleanup17.us
; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2
-; CHECK-NEXT: adds r7, r0, #7
+; CHECK-NEXT: add.w r11, r0, #7
; CHECK-NEXT: cmn.w r0, #4
-; CHECK-NEXT: mov.w r5, #0
-; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: mov.w r10, #0
+; CHECK-NEXT: mov r0, r11
; CHECK-NEXT: bge .LBB1_5
; CHECK-NEXT: .LBB1_10: @ %for.body6.us
; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1
@@ -484,14 +488,13 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1
; CHECK-NEXT: @ Parent Loop BB1_10 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vldrw.u32 q2, [r8]
-; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vqadd.u32 q2, q2, r1
-; CHECK-NEXT: add.w r1, r1, #4
-; CHECK-NEXT: vcmp.u32 hi, q6, q2
+; CHECK-NEXT: vqadd.u32 q2, q5, r1
+; CHECK-NEXT: adds r1, #4
+; CHECK-NEXT: vcmp.u32 hi, q7, q2
; CHECK-NEXT: vshl.i32 q2, q1, #2
-; CHECK-NEXT: vadd.i32 q2, q2, r9
-; CHECK-NEXT: vadd.i32 q1, q1, r10
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vadd.i32 q2, q2, r8
+; CHECK-NEXT: vadd.i32 q1, q1, r9
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [q2]
; CHECK-NEXT: bne .LBB1_12
@@ -504,14 +507,13 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1
; CHECK-NEXT: @ Parent Loop BB1_10 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vldrw.u32 q2, [r8]
-; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vqadd.u32 q2, q2, r1
-; CHECK-NEXT: add.w r1, r1, #4
-; CHECK-NEXT: vcmp.u32 hi, q5, q2
+; CHECK-NEXT: vqadd.u32 q2, q5, r1
+; CHECK-NEXT: adds r1, #4
+; CHECK-NEXT: vcmp.u32 hi, q6, q2
; CHECK-NEXT: vshl.i32 q2, q1, #2
-; CHECK-NEXT: vadd.i32 q2, q2, r9
-; CHECK-NEXT: vadd.i32 q1, q1, r10
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vadd.i32 q2, q2, r8
+; CHECK-NEXT: vadd.i32 q1, q1, r9
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [q2]
; CHECK-NEXT: bne .LBB1_14
@@ -521,7 +523,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: beq .LBB1_9
; CHECK-NEXT: @ %bb.16: @ %for.cond9.for.cond15.preheader_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2
-; CHECK-NEXT: eor r1, r5, #1
+; CHECK-NEXT: eor r1, r10, #1
; CHECK-NEXT: lsls r1, r1, #31
; CHECK-NEXT: bne .LBB1_9
; CHECK-NEXT: b .LBB1_26
@@ -530,11 +532,11 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: cmp r6, #0
; CHECK-NEXT: beq.w .LBB1_2
; CHECK-NEXT: @ %bb.18: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: mov r0, r11
; CHECK-NEXT: .LBB1_19: @ %for.body6.us60
; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: lsls r1, r5, #31
+; CHECK-NEXT: lsls.w r1, r10, #31
; CHECK-NEXT: bne .LBB1_27
; CHECK-NEXT: @ %bb.20: @ %for.cond.cleanup17.us63
; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2
@@ -550,19 +552,19 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: bgt .LBB1_25
; CHECK-NEXT: @ %bb.23: @ %for.cond.cleanup17.us63.3
; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2
-; CHECK-NEXT: add.w r7, r0, #28
+; CHECK-NEXT: add.w r11, r0, #28
; CHECK-NEXT: cmn.w r0, #25
-; CHECK-NEXT: mov.w r5, #0
-; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: mov.w r10, #0
+; CHECK-NEXT: mov r0, r11
; CHECK-NEXT: blt .LBB1_19
; CHECK-NEXT: b .LBB1_5
; CHECK-NEXT: .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit137
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: add.w r7, r0, #14
+; CHECK-NEXT: add.w r11, r0, #14
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_25: @ %for.cond.cleanup5.loopexit134.split.loop.exit135
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: add.w r7, r0, #21
+; CHECK-NEXT: add.w r11, r0, #21
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_26: @ %for.inc19.us
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
@@ -572,7 +574,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: b .LBB1_27
; CHECK-NEXT: .LBB1_28: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #16
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index 83ea44704e63f0..e63c62574dafbc 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -594,71 +594,71 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: .pad #28
+; CHECK-NEXT: sub sp, #28
; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill
+; CHECK-NEXT: strd r1, r2, [sp, #4] @ 8-byte Folded Spill
; CHECK-NEXT: blt .LBB13_5
; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: mov r9, r0
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: movs r6, #1
-; CHECK-NEXT: mov.w r10, #8
+; CHECK-NEXT: add r2, sp, #12
+; CHECK-NEXT: mov.w r9, #8
; CHECK-NEXT: bic r1, r1, #7
-; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: sub.w r7, r1, #8
-; CHECK-NEXT: add.w r0, r6, r7, lsr #3
-; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
-; CHECK-NEXT: add r0, sp, #16
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
+; CHECK-NEXT: sub.w r3, r1, #8
+; CHECK-NEXT: add.w r8, r6, r3, lsr #3
+; CHECK-NEXT: adr r3, .LCPI13_0
+; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: .LBB13_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB13_3 Depth 2
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: dls lr, r1
-; CHECK-NEXT: adr r1, .LCPI13_0
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: dls lr, r8
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB13_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB13_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vstrw.32 q0, [r0]
-; CHECK-NEXT: vadd.i16 q0, q0, r10
-; CHECK-NEXT: vldrh.s32 q1, [r0, #8]
-; CHECK-NEXT: vshl.i32 q1, q1, #1
-; CHECK-NEXT: vadd.i32 q1, q1, r9
-; CHECK-NEXT: vmov r3, r6, d3
-; CHECK-NEXT: vmov r5, r4, d2
-; CHECK-NEXT: vldrh.s32 q1, [r0]
-; CHECK-NEXT: vshl.i32 q1, q1, #1
-; CHECK-NEXT: vadd.i32 q1, q1, r9
-; CHECK-NEXT: vmov r12, r11, d3
-; CHECK-NEXT: ldrh.w r8, [r6]
-; CHECK-NEXT: vmov r2, r6, d2
+; CHECK-NEXT: vstrw.32 q1, [r2]
+; CHECK-NEXT: mov r12, r2
+; CHECK-NEXT: vldrh.s32 q2, [r2, #8]
+; CHECK-NEXT: vadd.i16 q1, q1, r9
+; CHECK-NEXT: vshl.i32 q2, q2, #1
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vmov r7, r5, d5
+; CHECK-NEXT: vmov r3, r4, d4
+; CHECK-NEXT: vldrh.s32 q2, [r2]
+; CHECK-NEXT: vshl.i32 q2, q2, #1
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vmov r1, r10, d5
+; CHECK-NEXT: ldrh r7, [r7]
; CHECK-NEXT: ldrh r4, [r4]
+; CHECK-NEXT: ldrh r5, [r5]
+; CHECK-NEXT: ldrh.w r2, [r10]
+; CHECK-NEXT: ldrh.w r10, [r3]
+; CHECK-NEXT: vmov r3, r11, d4
+; CHECK-NEXT: ldrh r1, [r1]
; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: ldrh.w r1, [r11]
-; CHECK-NEXT: ldrh.w r11, [r5]
-; CHECK-NEXT: ldrh.w r5, [r12]
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r6, [r6]
-; CHECK-NEXT: vmov.16 q1[0], r2
-; CHECK-NEXT: vmov.16 q1[1], r6
-; CHECK-NEXT: vmov.16 q1[2], r5
-; CHECK-NEXT: vmov.16 q1[3], r1
-; CHECK-NEXT: vmov.16 q1[4], r11
-; CHECK-NEXT: vmov.16 q1[5], r4
-; CHECK-NEXT: vmov.16 q1[6], r3
-; CHECK-NEXT: vmov.16 q1[7], r8
-; CHECK-NEXT: vstrb.8 q1, [r7], #16
+; CHECK-NEXT: ldrh.w r11, [r11]
+; CHECK-NEXT: vmov.16 q2[0], r3
+; CHECK-NEXT: vmov.16 q2[1], r11
+; CHECK-NEXT: vmov.16 q2[2], r1
+; CHECK-NEXT: vmov.16 q2[3], r2
+; CHECK-NEXT: mov r2, r12
+; CHECK-NEXT: vmov.16 q2[4], r10
+; CHECK-NEXT: vmov.16 q2[5], r4
+; CHECK-NEXT: vmov.16 q2[6], r7
+; CHECK-NEXT: vmov.16 q2[7], r5
+; CHECK-NEXT: vstrb.8 q2, [r6], #16
; CHECK-NEXT: le lr, .LBB13_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB13_2 Depth=1
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: cmp r2, r1
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload
+; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: bne .LBB13_2
; CHECK-NEXT: .LBB13_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: add sp, #28
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
@@ -711,144 +711,145 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #88
-; CHECK-NEXT: sub sp, #88
+; CHECK-NEXT: .pad #136
+; CHECK-NEXT: sub sp, #136
; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill
+; CHECK-NEXT: strd r1, r2, [sp, #64] @ 8-byte Folded Spill
; CHECK-NEXT: blt.w .LBB14_5
; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: add r4, sp, #72
-; CHECK-NEXT: add r7, sp, #40
-; CHECK-NEXT: add r5, sp, #56
+; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload
+; CHECK-NEXT: adr r3, .LCPI14_2
+; CHECK-NEXT: vldrw.u32 q0, [r3]
+; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: bic r1, r1, #7
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: subs r1, #8
-; CHECK-NEXT: vmov.i16 q6, #0x18
-; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: add.w r1, r3, r1, lsr #3
-; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
+; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT: vmov.i16 q2, #0x18
+; CHECK-NEXT: add.w r1, r2, r1, lsr #3
+; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill
+; CHECK-NEXT: adr r1, .LCPI14_0
+; CHECK-NEXT: adr r2, .LCPI14_1
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: add r2, sp, #120
+; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill
; CHECK-NEXT: .LBB14_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB14_3 Depth 2
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
+; CHECK-NEXT: add.w r10, sp, #104
; CHECK-NEXT: dls lr, r1
-; CHECK-NEXT: adr r1, .LCPI14_2
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: adr r1, .LCPI14_0
-; CHECK-NEXT: vldrw.u32 q2, [r1]
-; CHECK-NEXT: adr r1, .LCPI14_1
-; CHECK-NEXT: ldr.w r12, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: vldrw.u32 q3, [r1]
+; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q5, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #8] @ 16-byte Reload
; CHECK-NEXT: .LBB14_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vstrw.32 q1, [r4]
-; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: vldrh.s32 q0, [r4, #8]
-; CHECK-NEXT: mov r11, r4
-; CHECK-NEXT: mov r5, r7
+; CHECK-NEXT: vstrw.32 q5, [r2]
+; CHECK-NEXT: mov r8, r2
+; CHECK-NEXT: vldrh.s32 q0, [r2, #8]
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: vmov r6, r10, d1
-; CHECK-NEXT: vldrh.s32 q0, [r4]
+; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r4, r5, d1
+; CHECK-NEXT: vldrh.s32 q0, [r2]
; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q6, q0, r0
-; CHECK-NEXT: vmov r7, r4, d12
-; CHECK-NEXT: ldrh.w r9, [r2]
-; CHECK-NEXT: ldrh.w r2, [r10]
-; CHECK-NEXT: str r2, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: ldrh.w r8, [r3]
-; CHECK-NEXT: ldrh r3, [r6]
-; CHECK-NEXT: ldrh r2, [r7]
-; CHECK-NEXT: mov r7, r5
-; CHECK-NEXT: ldrh r4, [r4]
-; CHECK-NEXT: vstrw.32 q3, [r7]
-; CHECK-NEXT: vldrh.s32 q0, [r7]
-; CHECK-NEXT: vmov.16 q4[0], r2
-; CHECK-NEXT: vmov.16 q4[1], r4
-; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: vadd.i32 q2, q0, r0
+; CHECK-NEXT: vmov r6, r2, d4
+; CHECK-NEXT: ldrh r1, [r1]
+; CHECK-NEXT: ldrh.w r12, [r4]
+; CHECK-NEXT: add r4, sp, #88
+; CHECK-NEXT: ldrh.w r11, [r5]
+; CHECK-NEXT: ldrh r3, [r3]
+; CHECK-NEXT: ldrh r5, [r6]
+; CHECK-NEXT: ldrh r2, [r2]
+; CHECK-NEXT: vstrw.32 q6, [r4]
+; CHECK-NEXT: vldrh.s32 q0, [r4]
+; CHECK-NEXT: vmov.16 q7[0], r5
+; CHECK-NEXT: vmov.16 q7[1], r2
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r4, r6, d0
-; CHECK-NEXT: vmov r1, r2, d1
-; CHECK-NEXT: vldrh.s32 q0, [r7, #8]
+; CHECK-NEXT: vmov r6, r9, d0
+; CHECK-NEXT: vmov r2, r5, d1
+; CHECK-NEXT: vldrh.s32 q0, [r4, #8]
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: ldrh r4, [r4]
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.16 q5[0], r4
-; CHECK-NEXT: ldrh r4, [r6]
+; CHECK-NEXT: ldrh r6, [r6]
; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: vmov.16 q5[1], r4
-; CHECK-NEXT: vmov.16 q5[2], r1
-; CHECK-NEXT: vmov r1, r4, d0
-; CHECK-NEXT: vmov.16 q5[3], r2
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: ldrh r4, [r4]
-; CHECK-NEXT: vmov.16 q5[4], r1
-; CHECK-NEXT: vmov r1, r2, d1
-; CHECK-NEXT: vmov.16 q5[5], r4
-; CHECK-NEXT: ldrh r1, [r1]
+; CHECK-NEXT: vmov.16 q1[0], r6
+; CHECK-NEXT: ldrh.w r6, [r9]
+; CHECK-NEXT: ldrh r5, [r5]
+; CHECK-NEXT: vmov.16 q1[1], r6
+; CHECK-NEXT: vmov.16 q1[2], r2
+; CHECK-NEXT: vmov r2, r6, d0
+; CHECK-NEXT: vmov.16 q1[3], r5
; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: vstrw.32 q2, [r5]
-; CHECK-NEXT: vldrh.s32 q0, [r5]
-; CHECK-NEXT: vmov.16 q5[6], r1
-; CHECK-NEXT: vmov.16 q5[7], r2
+; CHECK-NEXT: ldrh r6, [r6]
+; CHECK-NEXT: vmov.16 q1[4], r2
+; CHECK-NEXT: vmov r2, r5, d1
+; CHECK-NEXT: vmov.16 q1[5], r6
+; CHECK-NEXT: mov r6, r10
+; CHECK-NEXT: ldrh r2, [r2]
+; CHECK-NEXT: ldrh r5, [r5]
+; CHECK-NEXT: vstrw.32 q4, [r10]
+; CHECK-NEXT: vldrh.s32 q0, [r6]
+; CHECK-NEXT: vmov.16 q1[6], r2
+; CHECK-NEXT: vmov.16 q1[7], r5
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r1, r2, d0
-; CHECK-NEXT: ldrh r1, [r1]
+; CHECK-NEXT: vmov r2, r5, d0
; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: vmov.16 q7[0], r1
-; CHECK-NEXT: vmov.16 q7[1], r2
-; CHECK-NEXT: vmov r1, r2, d13
-; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vadd.i16 q3, q3, q6
-; CHECK-NEXT: vadd.i16 q1, q1, q6
-; CHECK-NEXT: vadd.i16 q2, q2, q6
-; CHECK-NEXT: ldrh.w r10, [r2]
+; CHECK-NEXT: ldrh r5, [r5]
+; CHECK-NEXT: vmov.16 q3[0], r2
+; CHECK-NEXT: vmov.16 q3[1], r5
+; CHECK-NEXT: vmov r2, r5, d5
+; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: vadd.i16 q6, q6, q2
+; CHECK-NEXT: vadd.i16 q5, q5, q2
+; CHECK-NEXT: vadd.i16 q4, q4, q2
+; CHECK-NEXT: ldrh.w r9, [r2]
; CHECK-NEXT: vmov r2, r4, d1
-; CHECK-NEXT: vldrh.s32 q0, [r5, #8]
-; CHECK-NEXT: ldrh r1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r6, #8]
+; CHECK-NEXT: ldrh r5, [r5]
+; CHECK-NEXT: vmov.16 q7[2], r9
; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vmov.16 q4[2], r1
+; CHECK-NEXT: vmov.16 q7[3], r5
; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov.16 q4[3], r10
-; CHECK-NEXT: vmov.16 q4[4], r9
-; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
-; CHECK-NEXT: vmov.16 q4[5], r8
-; CHECK-NEXT: vmov.16 q4[6], r3
-; CHECK-NEXT: vmov.16 q4[7], r1
+; CHECK-NEXT: vmov.16 q7[4], r1
+; CHECK-NEXT: vmov.16 q7[5], r3
+; CHECK-NEXT: vmov.16 q7[6], r12
+; CHECK-NEXT: vmov.16 q7[7], r11
; CHECK-NEXT: ldrh r2, [r2]
; CHECK-NEXT: ldrh r4, [r4]
-; CHECK-NEXT: vmov.16 q7[2], r2
-; CHECK-NEXT: vmov.16 q7[3], r4
+; CHECK-NEXT: vmov.16 q3[2], r2
+; CHECK-NEXT: vmov.16 q3[3], r4
; CHECK-NEXT: vmov r2, r4, d0
; CHECK-NEXT: ldrh r2, [r2]
; CHECK-NEXT: ldrh r4, [r4]
-; CHECK-NEXT: vmov.16 q7[4], r2
-; CHECK-NEXT: vmov.16 q7[5], r4
+; CHECK-NEXT: vmov.16 q3[4], r2
+; CHECK-NEXT: vmov.16 q3[5], r4
; CHECK-NEXT: vmov r2, r4, d1
; CHECK-NEXT: ldrh r2, [r2]
; CHECK-NEXT: ldrh r4, [r4]
-; CHECK-NEXT: vmov.16 q7[6], r2
-; CHECK-NEXT: vmov.16 q7[7], r4
-; CHECK-NEXT: mov r4, r11
-; CHECK-NEXT: vadd.i16 q0, q7, q5
-; CHECK-NEXT: vadd.i16 q0, q0, q4
-; CHECK-NEXT: vstrb.8 q0, [r12], #16
+; CHECK-NEXT: vmov.16 q3[6], r2
+; CHECK-NEXT: mov r2, r8
+; CHECK-NEXT: vmov.16 q3[7], r4
+; CHECK-NEXT: vadd.i16 q0, q3, q1
+; CHECK-NEXT: vadd.i16 q0, q0, q7
+; CHECK-NEXT: vstrb.8 q0, [r7], #16
; CHECK-NEXT: le lr, .LBB14_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB14_2 Depth=1
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: cmp r1, r2
+; CHECK-NEXT: ldr r3, [sp, #68] @ 4-byte Reload
+; CHECK-NEXT: cmp r1, r3
; CHECK-NEXT: bne.w .LBB14_2
; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #88
+; CHECK-NEXT: add sp, #136
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -924,246 +925,260 @@ for.cond.cleanup: ; preds = %for.body, %middle.b
define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v16i8_complex:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: it lt
-; CHECK-NEXT: bxlt lr
-; CHECK-NEXT: .LBB15_1: @ %vector.ph.preheader
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #160
-; CHECK-NEXT: sub sp, #160
-; CHECK-NEXT: bic lr, r2, #7
-; CHECK-NEXT: mov r12, r1
-; CHECK-NEXT: vmov.i32 q0, #0x30
-; CHECK-NEXT: .LBB15_2: @ %vector.ph
-; CHECK-NEXT: @ =>This Loop Header: Depth=1
-; CHECK-NEXT: @ Child Loop BB15_3 Depth 2
+; CHECK-NEXT: .pad #312
+; CHECK-NEXT: sub sp, #312
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: str r1, [sp, #116] @ 4-byte Spill
+; CHECK-NEXT: blt.w .LBB15_5
+; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
; CHECK-NEXT: adr r1, .LCPI15_0
-; CHECK-NEXT: mov r8, r12
-; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: adr r6, .LCPI15_8
+; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: adr r1, .LCPI15_1
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: adr r1, .LCPI15_8
-; CHECK-NEXT: vldrw.u32 q4, [r1]
-; CHECK-NEXT: adr r1, .LCPI15_7
-; CHECK-NEXT: vldrw.u32 q5, [r1]
-; CHECK-NEXT: adr r1, .LCPI15_9
-; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: adr r7, .LCPI15_7
+; CHECK-NEXT: adr r3, .LCPI15_6
+; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: adr r1, .LCPI15_5
-; CHECK-NEXT: mov r9, lr
-; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: adr r1, .LCPI15_6
-; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: bic r10, r2, #7
+; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [r6]
+; CHECK-NEXT: adr r6, .LCPI15_9
+; CHECK-NEXT: vmov.i32 q2, #0x30
+; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [r7]
+; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [r6]
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [r3]
+; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: .LBB15_2: @ %vector.ph
+; CHECK-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NEXT: @ Child Loop BB15_3 Depth 2
; CHECK-NEXT: adr r1, .LCPI15_3
-; CHECK-NEXT: vldrw.u32 q6, [r1]
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: adr r1, .LCPI15_4
-; CHECK-NEXT: vldrw.u32 q7, [r1]
+; CHECK-NEXT: vldrw.u32 q5, [r1]
; CHECK-NEXT: adr r1, .LCPI15_2
-; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q7, [r1]
+; CHECK-NEXT: vldrw.u32 q3, [r1]
; CHECK-NEXT: adr r1, .LCPI15_10
-; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q7, [r1]
-; CHECK-NEXT: adr r1, .LCPI15_11
+; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q3, [r1]
-; CHECK-NEXT: vstrw.32 q7, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT: adr r1, .LCPI15_11
+; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload
+; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q7, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: mov r11, r10
+; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill
; CHECK-NEXT: .LBB15_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB15_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vadd.i32 q7, q6, r0
-; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vadd.i32 q6, q3, r0
-; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill
-; CHECK-NEXT: vadd.i32 q3, q1, r0
-; CHECK-NEXT: vmov r10, r1, d15
-; CHECK-NEXT: vmov r7, r11, d6
-; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vmov r5, r3, d13
-; CHECK-NEXT: vldrw.u32 q2, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vadd.i32 q2, q2, r0
-; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT: subs.w r9, r9, #16
+; CHECK-NEXT: vadd.i32 q4, q1, r0
+; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill
+; CHECK-NEXT: vmov r1, lr, d8
+; CHECK-NEXT: vadd.i32 q7, q7, r0
+; CHECK-NEXT: vmov r5, r4, d15
+; CHECK-NEXT: vadd.i32 q6, q0, r0
+; CHECK-NEXT: vmov r6, r7, d13
+; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload
; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: ldrb r6, [r1]
-; CHECK-NEXT: ldrb r1, [r7]
-; CHECK-NEXT: vmov r7, r4, d12
+; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill
+; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: subs.w r11, r11, #16
+; CHECK-NEXT: ldrb.w r9, [r1]
+; CHECK-NEXT: vmov r1, r3, d14
; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: ldrb r7, [r7]
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: vmov.8 q6[0], r7
-; CHECK-NEXT: vmov.8 q6[1], r4
-; CHECK-NEXT: vmov.8 q6[2], r5
-; CHECK-NEXT: vmov r4, r5, d14
-; CHECK-NEXT: vmov.8 q6[3], r3
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb r7, [r5]
-; CHECK-NEXT: vmov.8 q7[0], r4
-; CHECK-NEXT: ldrb.w r5, [r10]
-; CHECK-NEXT: vmov.8 q7[1], r7
-; CHECK-NEXT: ldrb.w r7, [r11]
+; CHECK-NEXT: ldrb r1, [r1]
+; CHECK-NEXT: vmov.8 q7[0], r1
+; CHECK-NEXT: ldrb r1, [r3]
+; CHECK-NEXT: vmov.8 q7[1], r1
+; CHECK-NEXT: vmov r1, r3, d12
; CHECK-NEXT: vmov.8 q7[2], r5
-; CHECK-NEXT: vmov r5, r10, d5
+; CHECK-NEXT: ldrb r5, [r6]
+; CHECK-NEXT: ldrb r6, [r4]
; CHECK-NEXT: vmov.8 q7[3], r6
-; CHECK-NEXT: vmov r3, r4, d4
-; CHECK-NEXT: vmov.8 q7[4], r1
-; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q7[5], r7
-; CHECK-NEXT: ldrb r6, [r5]
-; CHECK-NEXT: vmov r1, r5, d7
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q0
; CHECK-NEXT: ldrb r1, [r1]
+; CHECK-NEXT: ldrb r3, [r3]
+; CHECK-NEXT: vmov.8 q6[0], r1
+; CHECK-NEXT: vmov r6, r1, d2
+; CHECK-NEXT: vmov.8 q6[1], r3
+; CHECK-NEXT: vmov.8 q6[2], r5
+; CHECK-NEXT: vmov.8 q6[3], r7
+; CHECK-NEXT: ldrb.w r7, [lr]
+; CHECK-NEXT: vmov.8 q6[4], r9
+; CHECK-NEXT: vmov.8 q6[5], r7
+; CHECK-NEXT: ldrb r4, [r1]
+; CHECK-NEXT: vmov r1, r5, d3
+; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload
+; CHECK-NEXT: ldrb.w r12, [r1]
+; CHECK-NEXT: vmov r1, r3, d9
; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: vmov.8 q7[6], r1
-; CHECK-NEXT: vmov r1, r7, d2
-; CHECK-NEXT: vmov.8 q7[7], r5
+; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload
+; CHECK-NEXT: ldrb r1, [r1]
+; CHECK-NEXT: ldrb r3, [r3]
+; CHECK-NEXT: vmov.8 q6[6], r1
+; CHECK-NEXT: vmov r1, r7, d0
+; CHECK-NEXT: vmov.8 q6[7], r3
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r7, [r7]
-; CHECK-NEXT: vmov.8 q6[4], r1
-; CHECK-NEXT: vmov r1, r5, d3
-; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q6[5], r7
-; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov.8 q7[4], r1
+; CHECK-NEXT: vmov r1, r3, d1
+; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload
+; CHECK-NEXT: vmov.8 q7[5], r7
+; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: vmov.8 q6[6], r1
-; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q6[7], r5
-; CHECK-NEXT: vmov r3, r7, d2
-; CHECK-NEXT: vmov.8 q6[8], r1
-; CHECK-NEXT: vmov r1, r11, d3
-; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q6[9], r4
-; CHECK-NEXT: vmov.8 q6[10], r6
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vmov r5, r6, d2
-; CHECK-NEXT: ldrb r4, [r7]
-; CHECK-NEXT: ldrb.w r7, [r10]
; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q6[11], r7
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: ldrb r5, [r5]
+; CHECK-NEXT: vmov.8 q7[6], r1
+; CHECK-NEXT: ldrb r1, [r6]
+; CHECK-NEXT: vmov r7, r6, d0
+; CHECK-NEXT: vmov.8 q7[7], r3
+; CHECK-NEXT: vmov r3, lr, d1
+; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload
+; CHECK-NEXT: vmov.8 q7[8], r1
+; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vmov.8 q7[9], r4
+; CHECK-NEXT: vmov r4, r1, d0
+; CHECK-NEXT: vmov.8 q7[10], r12
+; CHECK-NEXT: vmov.8 q7[11], r5
+; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: ldrb r6, [r6]
-; CHECK-NEXT: vmov.8 q7[8], r5
-; CHECK-NEXT: vmov r5, r7, d3
-; CHECK-NEXT: vmov.8 q7[9], r6
-; CHECK-NEXT: vadd.i32 q1, q2, r0
-; CHECK-NEXT: vadd.i32 q2, q2, q0
-; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q2, q2, q0
+; CHECK-NEXT: ldrb r3, [r3]
+; CHECK-NEXT: ldrb r4, [r4]
+; CHECK-NEXT: ldrb r1, [r1]
+; CHECK-NEXT: vmov.8 q6[8], r4
+; CHECK-NEXT: vmov r5, r4, d1
+; CHECK-NEXT: vmov.8 q6[9], r1
+; CHECK-NEXT: vadd.i32 q0, q5, r0
+; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload
; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r7, [r7]
-; CHECK-NEXT: vmov.8 q7[10], r5
-; CHECK-NEXT: vmov.8 q7[11], r7
-; CHECK-NEXT: vmov.8 q7[12], r3
-; CHECK-NEXT: vmov.8 q7[13], r4
-; CHECK-NEXT: vmov.8 q7[14], r1
-; CHECK-NEXT: vmov r1, r3, d2
+; CHECK-NEXT: ldrb r4, [r4]
+; CHECK-NEXT: vmov.8 q6[10], r5
+; CHECK-NEXT: vmov.8 q6[11], r4
+; CHECK-NEXT: vmov.8 q6[12], r7
+; CHECK-NEXT: vmov.8 q6[13], r6
+; CHECK-NEXT: vmov.8 q6[14], r3
+; CHECK-NEXT: vmov r1, r3, d0
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q6[12], r1
+; CHECK-NEXT: vmov.8 q7[12], r1
; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q6[13], r1
-; CHECK-NEXT: vmov r1, r3, d3
-; CHECK-NEXT: vadd.i32 q1, q5, r0
-; CHECK-NEXT: vadd.i32 q5, q5, q0
+; CHECK-NEXT: vmov.8 q7[13], r1
+; CHECK-NEXT: vmov r1, r3, d1
+; CHECK-NEXT: vadd.i32 q0, q1, r0
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q1, q1, q2
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q6[14], r1
+; CHECK-NEXT: vmov.8 q7[14], r1
; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q6[15], r1
-; CHECK-NEXT: ldrb.w r1, [r11]
; CHECK-NEXT: vmov.8 q7[15], r1
-; CHECK-NEXT: vmov r1, r3, d2
-; CHECK-NEXT: vadd.i8 q6, q7, q6
+; CHECK-NEXT: ldrb.w r1, [lr]
+; CHECK-NEXT: vmov.8 q6[15], r1
+; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vadd.i8 q6, q6, q7
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: vmov.8 q7[0], r1
; CHECK-NEXT: vmov.8 q7[1], r3
-; CHECK-NEXT: vmov r1, r3, d3
-; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r1, r3, d1
+; CHECK-NEXT: vadd.i32 q0, q3, r0
+; CHECK-NEXT: vadd.i32 q3, q3, q2
+; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q3, q3, q2
+; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q3, q3, q2
+; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q3, q3, q2
+; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[2], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[3], r1
-; CHECK-NEXT: vmov r1, r3, d2
+; CHECK-NEXT: vmov r1, r3, d0
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[4], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[5], r1
-; CHECK-NEXT: vmov r1, r3, d3
-; CHECK-NEXT: vadd.i32 q1, q4, r0
-; CHECK-NEXT: vadd.i32 q4, q4, q0
+; CHECK-NEXT: vmov r1, r3, d1
+; CHECK-NEXT: vadd.i32 q0, q5, r0
+; CHECK-NEXT: vadd.i32 q5, q5, q2
+; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q5, q5, q2
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[6], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[7], r1
-; CHECK-NEXT: vmov r1, r3, d2
+; CHECK-NEXT: vmov r1, r3, d0
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[8], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[9], r1
-; CHECK-NEXT: vmov r1, r3, d3
-; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r1, r3, d1
+; CHECK-NEXT: vadd.i32 q0, q4, r0
+; CHECK-NEXT: vadd.i32 q4, q4, q2
+; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[10], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[11], r1
-; CHECK-NEXT: vmov r1, r3, d2
+; CHECK-NEXT: vmov r1, r3, d0
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[12], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[13], r1
-; CHECK-NEXT: vmov r1, r3, d3
+; CHECK-NEXT: vmov r1, r3, d1
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[14], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[15], r1
-; CHECK-NEXT: vadd.i8 q1, q6, q7
-; CHECK-NEXT: vldrw.u32 q7, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vstrb.8 q1, [r8], #16
-; CHECK-NEXT: vadd.i32 q7, q7, q0
-; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q7, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q6, q6, q0
-; CHECK-NEXT: vadd.i32 q1, q1, q0
-; CHECK-NEXT: vadd.i32 q7, q7, q0
-; CHECK-NEXT: vstrw.32 q7, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q7, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q7, q7, q0
-; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q7, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q7, q7, q0
-; CHECK-NEXT: vstrw.32 q7, [sp, #128] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q7, q7, q0
-; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill
+; CHECK-NEXT: vadd.i8 q0, q6, q7
+; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload
+; CHECK-NEXT: vstrb.8 q0, [r8], #16
+; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q7, q7, q2
+; CHECK-NEXT: vadd.i32 q0, q0, q2
; CHECK-NEXT: bne.w .LBB15_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB15_2 Depth=1
-; CHECK-NEXT: cmp lr, r2
+; CHECK-NEXT: cmp r10, r2
; CHECK-NEXT: bne.w .LBB15_2
-; CHECK-NEXT: @ %bb.5:
-; CHECK-NEXT: add sp, #160
+; CHECK-NEXT: .LBB15_5: @ %for.cond.cleanup
+; CHECK-NEXT: add sp, #312
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
; CHECK-NEXT: .LCPI15_0:
@@ -1272,95 +1287,102 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #64
+; CHECK-NEXT: sub sp, #64
; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill
-; CHECK-NEXT: blt .LBB16_5
+; CHECK-NEXT: strd r1, r2, [sp, #56] @ 8-byte Folded Spill
+; CHECK-NEXT: blt.w .LBB16_5
; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: mov.w r11, #16
-; CHECK-NEXT: bic r3, r1, #7
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: adr r5, .LCPI16_3
+; CHECK-NEXT: adr r7, .LCPI16_1
+; CHECK-NEXT: vldrw.u32 q0, [r5]
+; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
+; CHECK-NEXT: adr r3, .LCPI16_0
+; CHECK-NEXT: adr r6, .LCPI16_2
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [r7]
+; CHECK-NEXT: bic r9, r1, #7
+; CHECK-NEXT: vldrw.u32 q3, [r3]
+; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [r6]
+; CHECK-NEXT: mov.w lr, #16
+; CHECK-NEXT: str.w r9, [sp, #52] @ 4-byte Spill
+; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: .LBB16_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_3 Depth 2
-; CHECK-NEXT: adr r1, .LCPI16_3
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: vldrw.u32 q5, [r1]
-; CHECK-NEXT: adr r1, .LCPI16_1
-; CHECK-NEXT: vldrw.u32 q4, [r1]
-; CHECK-NEXT: adr r1, .LCPI16_2
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: adr r1, .LCPI16_0
-; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload
+; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: .LBB16_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vadd.i32 q6, q1, r0
-; CHECK-NEXT: vadd.i32 q2, q0, r0
-; CHECK-NEXT: vmov r4, r5, d13
-; CHECK-NEXT: vadd.i32 q3, q5, r11
-; CHECK-NEXT: vmov lr, r8, d4
-; CHECK-NEXT: subs r3, #16
-; CHECK-NEXT: vmov r6, r12, d5
-; CHECK-NEXT: vadd.i32 q2, q4, r11
-; CHECK-NEXT: vadd.i32 q1, q1, r11
-; CHECK-NEXT: vadd.i32 q0, q0, r11
-; CHECK-NEXT: ldrb.w r10, [r5]
-; CHECK-NEXT: vmov r2, r5, d12
-; CHECK-NEXT: vadd.i32 q6, q5, r0
-; CHECK-NEXT: vadd.i32 q5, q4, r0
-; CHECK-NEXT: ldrb.w r1, [r8]
-; CHECK-NEXT: ldrb.w r9, [r4]
-; CHECK-NEXT: ldrb r4, [r6]
-; CHECK-NEXT: ldrb.w r6, [lr]
-; CHECK-NEXT: ldrb.w r12, [r12]
-; CHECK-NEXT: ldrb r2, [r2]
+; CHECK-NEXT: vadd.i32 q1, q5, r0
+; CHECK-NEXT: vadd.i32 q2, q4, r0
+; CHECK-NEXT: vmov r7, r3, d3
+; CHECK-NEXT: vadd.i32 q6, q0, lr
+; CHECK-NEXT: vmov r5, r6, d5
+; CHECK-NEXT: subs.w r9, r9, #16
+; CHECK-NEXT: vmov r4, r10, d2
+; CHECK-NEXT: vadd.i32 q1, q7, lr
+; CHECK-NEXT: vadd.i32 q4, q4, lr
+; CHECK-NEXT: vadd.i32 q5, q5, lr
+; CHECK-NEXT: ldrb.w r11, [r3]
+; CHECK-NEXT: ldrb r3, [r7]
+; CHECK-NEXT: vmov r7, r12, d4
+; CHECK-NEXT: vadd.i32 q2, q7, r0
+; CHECK-NEXT: vadd.i32 q7, q0, r0
; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: vmov.8 q4[0], r2
-; CHECK-NEXT: vmov.8 q4[1], r5
-; CHECK-NEXT: vmov r8, r5, d11
-; CHECK-NEXT: vmov.8 q4[2], r9
-; CHECK-NEXT: vmov.8 q4[3], r10
-; CHECK-NEXT: vmov.8 q4[4], r6
-; CHECK-NEXT: vmov.8 q4[5], r1
-; CHECK-NEXT: vmov.8 q4[6], r4
-; CHECK-NEXT: vmov r4, r6, d10
-; CHECK-NEXT: vmov.8 q4[7], r12
-; CHECK-NEXT: vmov q5, q3
-; CHECK-NEXT: ldrb.w lr, [r5]
-; CHECK-NEXT: vmov r5, r2, d13
-; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r6, [r6]
-; CHECK-NEXT: vmov.8 q4[8], r4
-; CHECK-NEXT: vmov.8 q4[9], r6
-; CHECK-NEXT: ldrb.w r9, [r2]
-; CHECK-NEXT: vmov r1, r2, d12
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb.w r10, [r2]
-; CHECK-NEXT: ldrb.w r2, [r8]
+; CHECK-NEXT: ldrb r4, [r4]
+; CHECK-NEXT: ldrb.w r10, [r10]
+; CHECK-NEXT: ldrb r7, [r7]
+; CHECK-NEXT: ldrb.w r1, [r12]
+; CHECK-NEXT: vmov.8 q0[0], r7
+; CHECK-NEXT: vmov.8 q0[1], r1
+; CHECK-NEXT: vmov r1, r7, d15
+; CHECK-NEXT: vmov.8 q0[2], r5
+; CHECK-NEXT: vmov.8 q0[3], r6
+; CHECK-NEXT: vmov.8 q0[4], r4
+; CHECK-NEXT: vmov r4, r2, d4
+; CHECK-NEXT: vmov.8 q0[5], r10
+; CHECK-NEXT: vmov.8 q0[6], r3
+; CHECK-NEXT: vmov.8 q0[7], r11
+; CHECK-NEXT: ldrb r6, [r7]
+; CHECK-NEXT: vmov r5, r7, d5
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q4[10], r2
-; CHECK-NEXT: vmov.8 q4[11], lr
-; CHECK-NEXT: vmov.8 q4[12], r1
-; CHECK-NEXT: vmov.8 q4[13], r10
-; CHECK-NEXT: vmov.8 q4[14], r5
-; CHECK-NEXT: vmov.8 q4[15], r9
-; CHECK-NEXT: vstrb.8 q4, [r7], #16
-; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: ldrb r2, [r2]
+; CHECK-NEXT: ldrb r3, [r5]
+; CHECK-NEXT: ldrb.w r12, [r7]
+; CHECK-NEXT: ldrb r5, [r4]
+; CHECK-NEXT: vmov r4, r7, d14
+; CHECK-NEXT: vmov q7, q1
+; CHECK-NEXT: ldrb r4, [r4]
+; CHECK-NEXT: ldrb r7, [r7]
+; CHECK-NEXT: vmov.8 q0[8], r4
+; CHECK-NEXT: vmov.8 q0[9], r7
+; CHECK-NEXT: vmov.8 q0[10], r1
+; CHECK-NEXT: vmov.8 q0[11], r6
+; CHECK-NEXT: vmov.8 q0[12], r5
+; CHECK-NEXT: vmov.8 q0[13], r2
+; CHECK-NEXT: vmov.8 q0[14], r3
+; CHECK-NEXT: vmov.8 q0[15], r12
+; CHECK-NEXT: vstrb.8 q0, [r8], #16
+; CHECK-NEXT: vmov q0, q6
; CHECK-NEXT: bne .LBB16_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: cmp r3, r1
+; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
+; CHECK-NEXT: ldr.w r9, [sp, #52] @ 4-byte Reload
+; CHECK-NEXT: cmp r9, r1
; CHECK-NEXT: bne .LBB16_2
; CHECK-NEXT: .LBB16_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #16
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: add sp, #64
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index 82ec62ec9f7a13..7b8b884576d13e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -602,57 +602,60 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #24
-; CHECK-NEXT: sub sp, #24
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill
; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrne r0, [sp, #112]
+; CHECK-NEXT: ldrne r0, [sp, #136]
; CHECK-NEXT: cmpne r0, #0
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #24
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader
-; CHECK-NEXT: ldr.w r12, [sp, #116]
+; CHECK-NEXT: ldr.w r12, [sp, #140]
; CHECK-NEXT: movs r7, #1
+; CHECK-NEXT: mov.w r11, #0
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: bic r0, r12, #3
-; CHECK-NEXT: subs r3, r0, #4
-; CHECK-NEXT: add.w r3, r7, r3, lsr #2
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: ldr r3, [sp, #112]
-; CHECK-NEXT: lsl.w r7, r12, #1
-; CHECK-NEXT: str r7, [sp] @ 4-byte Spill
-; CHECK-NEXT: movs r7, #0
-; CHECK-NEXT: vdup.32 q1, r3
-; CHECK-NEXT: lsls r6, r3, #1
-; CHECK-NEXT: vshl.i32 q2, q1, #2
-; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: str r3, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: bic r2, r12, #3
+; CHECK-NEXT: subs r3, r2, #4
+; CHECK-NEXT: add.w r0, r7, r3, lsr #2
+; CHECK-NEXT: ldr r7, [sp, #136]
+; CHECK-NEXT: adr r3, .LCPI10_0
+; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: lsl.w r0, r12, #1
+; CHECK-NEXT: vdup.32 q1, r7
+; CHECK-NEXT: vldrw.u32 q2, [r3]
+; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: lsls r6, r7, #1
+; CHECK-NEXT: vshl.i32 q3, q1, #2
+; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: b .LBB10_5
; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT: add.w r3, r2, r8, lsl #1
+; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: add.w r3, r0, r5, lsl #1
; CHECK-NEXT: wlstp.8 lr, r6, .LBB10_4
; CHECK-NEXT: b .LBB10_15
; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload
-; CHECK-NEXT: add r7, r12
-; CHECK-NEXT: ldr r5, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: add r5, r3
-; CHECK-NEXT: str r5, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: adds r5, #1
-; CHECK-NEXT: cmp r5, r3
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: add r11, r12
+; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add r3, r0
+; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: adds r3, #1
+; CHECK-NEXT: cmp r3, r0
; CHECK-NEXT: beq .LBB10_1
; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us
; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -660,22 +663,21 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
; CHECK-NEXT: @ Child Loop BB10_15 Depth 2
-; CHECK-NEXT: ldr r3, [sp, #112]
+; CHECK-NEXT: mul r5, r3, r7
; CHECK-NEXT: cmp.w r12, #0
-; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: mul r8, r5, r3
+; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: beq .LBB10_3
; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT: mov.w r9, #0
+; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: b .LBB10_8
; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: add.w r3, r9, r8
-; CHECK-NEXT: add.w r9, r9, #1
-; CHECK-NEXT: strh.w r10, [r2, r3, lsl #1]
-; CHECK-NEXT: ldr r3, [sp, #112]
-; CHECK-NEXT: cmp r9, r3
+; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: add.w r0, r8, r5
+; CHECK-NEXT: add.w r8, r8, #1
+; CHECK-NEXT: cmp r8, r7
+; CHECK-NEXT: strh.w r10, [r3, r0, lsl #1]
; CHECK-NEXT: beq .LBB10_4
; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
@@ -690,48 +692,46 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: b .LBB10_13
; CHECK-NEXT: .LBB10_10: @ %vector.ph
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: adr r3, .LCPI10_0
-; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: vldrw.u32 q5, [r3]
-; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: vmov.i32 q3, #0x0
-; CHECK-NEXT: dls lr, r3
-; CHECK-NEXT: vmlas.i32 q4, q5, r9
-; CHECK-NEXT: ldr r3, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: vmov q5, q1
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmlas.i32 q5, q2, r8
+; CHECK-NEXT: dls lr, r0
+; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: .LBB10_11: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vadd.i32 q5, q4, q2
-; CHECK-NEXT: vldrh.s32 q6, [r1, q4, uxtw #1]
-; CHECK-NEXT: vldrh.s32 q4, [r3], #8
-; CHECK-NEXT: vmul.i32 q4, q6, q4
-; CHECK-NEXT: vadd.i32 q3, q4, q3
-; CHECK-NEXT: vmov q4, q5
+; CHECK-NEXT: vadd.i32 q6, q5, q3
+; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1]
+; CHECK-NEXT: vldrh.s32 q5, [r3], #8
+; CHECK-NEXT: vmul.i32 q5, q7, q5
+; CHECK-NEXT: vadd.i32 q4, q5, q4
+; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: le lr, .LBB10_11
; CHECK-NEXT: @ %bb.12: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: vaddv.u32 r10, q3
-; CHECK-NEXT: cmp r0, r12
-; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: vaddv.u32 r10, q4
+; CHECK-NEXT: cmp r2, r12
+; CHECK-NEXT: mov r4, r2
; CHECK-NEXT: beq .LBB10_7
; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: ldr r3, [sp, #112]
+; CHECK-NEXT: mla r3, r7, r4, r8
+; CHECK-NEXT: add.w r0, r11, r4
+; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: sub.w lr, r12, r4
-; CHECK-NEXT: ldr r5, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: mla r3, r3, r4, r9
-; CHECK-NEXT: add.w r11, r1, r3, lsl #1
-; CHECK-NEXT: adds r3, r7, r4
-; CHECK-NEXT: add.w r3, r5, r3, lsl #1
+; CHECK-NEXT: add.w r9, r7, r0, lsl #1
+; CHECK-NEXT: ldr r7, [sp, #136]
+; CHECK-NEXT: add.w r3, r1, r3, lsl #1
; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: ldrsh.w r5, [r11]
-; CHECK-NEXT: add r11, r6
-; CHECK-NEXT: ldrsh r4, [r3], #2
-; CHECK-NEXT: smlabb r10, r5, r4, r10
+; CHECK-NEXT: ldrsh.w r4, [r3]
+; CHECK-NEXT: add r3, r6
+; CHECK-NEXT: ldrsh r0, [r9], #2
+; CHECK-NEXT: smlabb r10, r4, r0, r10
; CHECK-NEXT: le lr, .LBB10_14
; CHECK-NEXT: b .LBB10_7
; CHECK-NEXT: .LBB10_15: @ Parent Loop BB10_5 Depth=1
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
index 9dc2d62f21d95a..6633cec659d8e5 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
@@ -121,29 +121,31 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s7, s6, 31
; CHECK-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CHECK-NEXT: s_add_u32 s4, s0, s4
-; CHECK-NEXT: s_addc_u32 s5, s1, s5
-; CHECK-NEXT: s_add_u32 s0, s4, -8
-; CHECK-NEXT: s_addc_u32 s1, s5, -1
-; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 9
-; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
+; CHECK-NEXT: s_add_u32 s0, s0, s4
+; CHECK-NEXT: s_addc_u32 s1, s1, s5
+; CHECK-NEXT: s_add_u32 s4, s0, -8
+; CHECK-NEXT: s_addc_u32 s5, s1, -1
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 9
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
; CHECK-NEXT: .LBB3_1: ; %bb0
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT: s_cbranch_vccnz .LBB3_1
; CHECK-NEXT: ; %bb.2: ; %bb1
-; CHECK-NEXT: s_mov_b64 s[4:5], exec
-; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
-; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; CHECK-NEXT: s_mov_b64 s[0:1], exec
+; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc
; CHECK-NEXT: s_cbranch_execz .LBB3_4
; CHECK-NEXT: ; %bb.3:
-; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s4
+; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; CHECK-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: .LBB3_4:
More information about the llvm-commits
mailing list