[llvm] [AMDGPU] Improve DPP combining for fadd (PR #100148)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 23 09:16:17 PDT 2024
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/100148
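Recognize immediate identity values for fadd when DPP combining: -0.0 is always an identity for fadd, and +0.0 is also an identity when the instruction has the nsz flag.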
From fbf687b508c6826f7dcc940c8161f462a8e4e975 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 23 Jul 2024 17:13:26 +0100
Subject: [PATCH] [AMDGPU] Improve DPP combining for fadd
Recognize identities for fadd and fadd nsz.
---
llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 16 +-
.../AMDGPU/global_atomics_scan_fadd.ll | 832 ++++++-----------
.../AMDGPU/global_atomics_scan_fsub.ll | 880 ++++++------------
3 files changed, 569 insertions(+), 1159 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 0ac079c69e605..37b783cccd5cd 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -430,9 +430,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
return DPPInst.getInstr();
}
-static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
+static bool isIdentityValue(const MachineInstr &MI, MachineOperand *OldOpnd) {
assert(OldOpnd->isImm());
- switch (OrigMIOp) {
+ switch (MI.getOpcode()) {
default: break;
case AMDGPU::V_ADD_U32_e32:
case AMDGPU::V_ADD_U32_e64:
@@ -478,6 +478,16 @@ static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
if (OldOpnd->getImm() == 1)
return true;
break;
+ case AMDGPU::V_ADD_F32_e32:
+ case AMDGPU::V_ADD_F32_e64:
+ // -0.0 is an identity for fadd.
+ if (static_cast<uint32_t>(OldOpnd->getImm()) == 0x80000000)
+ return true;
+ // +0.0 is an identity for fadd except for the sign of a zero result.
+ if (MI.getFlag(MachineInstr::FmNsz) &&
+ static_cast<uint32_t>(OldOpnd->getImm()) == 0)
+ return true;
+ break;
}
return false;
}
@@ -492,7 +502,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(
LLVM_DEBUG(dbgs() << " failed: no src1 or it isn't a register\n");
return nullptr;
}
- if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
+ if (!isIdentityValue(OrigMI, OldOpndValue)) {
LLVM_DEBUG(dbgs() << " failed: old immediate isn't an identity\n");
return nullptr;
}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index c05f9c679979d..41061456cd130 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -814,37 +814,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -898,25 +883,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -934,15 +909,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -980,25 +955,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -1011,14 +976,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -1046,54 +1011,41 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: global_atomic_add_f32 v3, v0, s[0:1]
; GFX1164-DPP-NEXT: .LBB1_2:
; GFX1164-DPP-NEXT: s_endpgm
;
@@ -1115,46 +1067,34 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: global_atomic_add_f32 v3, v0, s[0:1]
; GFX1132-DPP-NEXT: .LBB1_2:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
@@ -2138,37 +2078,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -2222,25 +2147,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -2258,15 +2173,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -2304,25 +2219,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -2335,14 +2240,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -2370,63 +2275,50 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT: v_add_f32_e32 v3, v4, v0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
@@ -2452,55 +2344,43 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT: v_add_f32_e32 v3, v4, v0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
@@ -3462,37 +3342,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -3546,25 +3411,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -3582,15 +3437,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -3628,25 +3483,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -3659,14 +3504,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -3694,54 +3539,41 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: global_atomic_add_f32 v3, v0, s[0:1]
; GFX1164-DPP-NEXT: .LBB5_2:
; GFX1164-DPP-NEXT: s_endpgm
;
@@ -3763,46 +3595,34 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: global_atomic_add_f32 v3, v0, s[0:1]
; GFX1132-DPP-NEXT: .LBB5_2:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
@@ -4256,37 +4076,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -4340,25 +4145,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -4376,15 +4171,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -4422,25 +4217,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -4453,14 +4238,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -4488,54 +4273,41 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: global_atomic_add_f32 v3, v0, s[0:1]
; GFX1164-DPP-NEXT: .LBB6_2:
; GFX1164-DPP-NEXT: s_endpgm
;
@@ -4557,46 +4329,34 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: global_atomic_add_f32 v3, v0, s[0:1]
; GFX1132-DPP-NEXT: .LBB6_2:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value() strictfp
@@ -5579,37 +5339,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -5663,25 +5408,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -5699,15 +5434,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -5745,25 +5480,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -5776,14 +5501,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -5811,63 +5536,50 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT: v_add_f32_e32 v3, v4, v0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
@@ -5893,55 +5605,43 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT: v_add_f32_e32 v3, v4, v0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 5ffa71d37164c..2cd1df1f2d20b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -892,37 +892,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -976,25 +961,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -1012,15 +987,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -1058,25 +1033,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -1089,14 +1054,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -1124,63 +1089,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
@@ -1206,55 +1158,43 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
@@ -2242,37 +2182,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -2326,25 +2251,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -2362,15 +2277,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -2408,25 +2323,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -2439,14 +2344,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -2474,63 +2379,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
@@ -2556,55 +2448,43 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
@@ -3592,37 +3472,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -3676,25 +3541,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -3712,15 +3567,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -3758,25 +3613,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -3789,14 +3634,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -3824,63 +3669,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
@@ -3906,55 +3738,43 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
@@ -4438,37 +4258,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -4522,25 +4327,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -4558,15 +4353,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -4604,25 +4399,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -4635,14 +4420,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -4670,63 +4455,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
@@ -4752,55 +4524,43 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
@@ -5787,37 +5547,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -5871,25 +5616,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -5907,15 +5642,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -5953,25 +5688,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
@@ -5984,14 +5709,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: global_load_dword v1, v5, s[0:1]
; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v5, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
@@ -6019,63 +5744,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1164-DPP-NEXT: s_not_b64 exec, exec
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
@@ -6101,55 +5813,43 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT: global_load_b32 v4, v5, s[0:1]
; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v5, v[3:4], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
More information about the llvm-commits
mailing list