[llvm] [AMDGPU] Move WWM register pre-allocation to during regalloc (PR #70618)

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 2 02:13:53 PDT 2023


https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/70618

>From 04174a223f8e68b80276c080954858a11cabe28e Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 30 Oct 2023 09:54:24 +0900
Subject: [PATCH 1/3] [AMDGPU] Move WWM register pre-allocation to during
 regalloc

Move SIPreAllocateWWMRegs pass to just before VGPR allocation.
This saves recomputation of the virtual matrix and live reg map,
with the slight regression in O0 that live intervals and slot
indexes must be computed.
---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |    4 +-
 .../Target/AMDGPU/SIPreAllocateWWMRegs.cpp    |    4 +-
 .../AMDGPU/bb-prolog-spill-during-regalloc.ll |    2 +-
 .../AMDGPU/global_atomics_scan_fadd.ll        | 1152 ++++++++--------
 .../AMDGPU/global_atomics_scan_fmax.ll        |  927 +++++++------
 .../AMDGPU/global_atomics_scan_fmin.ll        |  927 +++++++------
 .../AMDGPU/global_atomics_scan_fsub.ll        | 1200 ++++++++---------
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |   24 +-
 .../CodeGen/AMDGPU/sgpr-regalloc-flags.ll     |    9 +
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |  517 ++++---
 llvm/test/CodeGen/AMDGPU/wwm-reserved.ll      |  452 +++----
 11 files changed, 2563 insertions(+), 2655 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 375df27206f7b41..5b8b89ad6ebf19d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1277,7 +1277,6 @@ void GCNPassConfig::addFastRegAlloc() {
   insertPass(&PHIEliminationID, &SILowerControlFlowID);
 
   insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
-  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);
 
   TargetPassConfig::addFastRegAlloc();
 }
@@ -1286,7 +1285,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
   // instructions that cause scheduling barriers.
   insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
-  insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);
 
   if (OptExecMaskPreRA)
     insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
@@ -1373,6 +1371,7 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {
 
   // Equivalent of PEI for SGPRs.
   addPass(&SILowerSGPRSpillsID);
+  addPass(&SIPreAllocateWWMRegsID);
 
   addPass(createVGPRAllocPass(false));
 
@@ -1396,6 +1395,7 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
 
   // Equivalent of PEI for SGPRs.
   addPass(&SILowerSGPRSpillsID);
+  addPass(&SIPreAllocateWWMRegsID);
 
   addPass(createVGPRAllocPass(true));
 
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index c2ddfd7881ab760..fc35bec0edd3bdb 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -56,11 +56,9 @@ class SIPreAllocateWWMRegs : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LiveIntervals>();
-    AU.addPreserved<LiveIntervals>();
     AU.addRequired<VirtRegMap>();
     AU.addRequired<LiveRegMatrix>();
-    AU.addPreserved<SlotIndexes>();
-    AU.setPreservesCFG();
+    AU.setPreservesAll();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
index e22cb912552f970..28780fdf441381c 100644
--- a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
@@ -11,7 +11,7 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
   ; REGALLOC-NEXT:   renamable $vgpr3 = IMPLICIT_DEF
   ; REGALLOC-NEXT:   SI_SPILL_V32_SAVE killed $vgpr2, %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
   ; REGALLOC-NEXT:   SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
-  ; REGALLOC-NEXT:   renamable $vgpr1 = COPY killed $vgpr0
+  ; REGALLOC-NEXT:   renamable $vgpr1 = COPY $vgpr0
   ; REGALLOC-NEXT:   $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
   ; REGALLOC-NEXT:   renamable $sgpr4 = S_MOV_B32 49
   ; REGALLOC-NEXT:   renamable $sgpr4_sgpr5 = V_CMP_GT_I32_e64 killed $vgpr1, killed $sgpr4, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 4cbd5e84871cc75..6844bebb8ab7623 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -725,36 +725,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -809,50 +809,50 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -892,44 +892,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -959,53 +959,53 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_atomic_add_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB1_2:
 ; GFX1164-DPP-NEXT:    s_nop 0
 ; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1031,45 +1031,45 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_atomic_add_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB1_2:
 ; GFX1132-DPP-NEXT:    s_nop 0
 ; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1953,36 +1953,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -2037,50 +2037,50 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -2120,44 +2120,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -2187,62 +2187,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -2270,54 +2270,54 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -3181,36 +3181,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3265,50 +3265,50 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -3348,44 +3348,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -3415,53 +3415,53 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_atomic_add_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB5_2:
 ; GFX1164-DPP-NEXT:    s_nop 0
 ; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3487,45 +3487,45 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_atomic_add_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB5_2:
 ; GFX1132-DPP-NEXT:    s_nop 0
 ; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3929,36 +3929,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -4013,50 +4013,50 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -4096,44 +4096,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -4163,53 +4163,53 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_atomic_add_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB6_2:
 ; GFX1164-DPP-NEXT:    s_nop 0
 ; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4235,45 +4235,45 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_atomic_add_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB6_2:
 ; GFX1132-DPP-NEXT:    s_nop 0
 ; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5156,36 +5156,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -5240,50 +5240,50 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -5323,44 +5323,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -5390,62 +5390,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -5473,54 +5473,54 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index e07514b063ee481..29f6ce7a8331ccd 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -787,43 +787,43 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -832,14 +832,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
+; GFX9-DPP-NEXT:    v_max_f32_e64 v3, s4, s4
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX9-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
 ; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -880,51 +880,51 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v3, s2, s2
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v4, s3, s3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 0
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v40, s2, s2
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v41, s3, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
@@ -932,7 +932,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
 ; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -973,36 +973,36 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -1010,14 +1010,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
 ; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -1048,75 +1048,75 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -1144,63 +1144,62 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v41, v41 :: v_dual_mov_b32 v42, 0xff800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0xff800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0xff800000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_max_f32 v3, v0, v0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -1986,43 +1985,43 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -2031,14 +2030,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
+; GFX9-DPP-NEXT:    v_max_f32_e64 v3, s4, s4
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX9-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
 ; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -2079,51 +2078,51 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v3, s2, s2
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v4, s3, s3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 0
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v40, s2, s2
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v41, s3, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
@@ -2131,7 +2130,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
 ; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -2172,36 +2171,36 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -2209,14 +2208,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
 ; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -2247,75 +2246,75 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -2343,63 +2342,62 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v41, v41 :: v_dual_mov_b32 v42, 0xff800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0xff800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0xff800000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_max_f32 v3, v0, v0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -3185,43 +3183,43 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3230,14 +3228,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
+; GFX9-DPP-NEXT:    v_max_f32_e64 v3, s4, s4
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX9-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
 ; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -3278,51 +3276,51 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v3, s2, s2
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v4, s3, s3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 0
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v40, s2, s2
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v41, s3, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
@@ -3330,7 +3328,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
 ; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -3371,36 +3369,36 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -3408,14 +3406,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
 ; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -3446,75 +3444,75 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -3542,63 +3540,62 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v41, v41 :: v_dual_mov_b32 v42, 0xff800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0xff800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0xff800000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_max_f32 v3, v0, v0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index ad6edbd2c37a09d..a760f963b54eca4 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -787,43 +787,43 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -832,14 +832,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
+; GFX9-DPP-NEXT:    v_max_f32_e64 v3, s4, s4
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX9-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
+; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
 ; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -880,51 +880,51 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v3, s2, s2
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v4, s3, s3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 0
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v40, s2, s2
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v41, s3, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
@@ -932,7 +932,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
 ; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -973,36 +973,36 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -1010,14 +1010,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
 ; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -1048,75 +1048,75 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -1144,63 +1144,62 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7f800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v41, v41 :: v_dual_mov_b32 v42, 0x7f800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0x7f800000
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_max_f32 v3, v0, v0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -1986,43 +1985,43 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -2031,14 +2030,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
+; GFX9-DPP-NEXT:    v_max_f32_e64 v3, s4, s4
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX9-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
+; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
 ; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -2079,51 +2078,51 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v3, s2, s2
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v4, s3, s3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 0
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v40, s2, s2
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v41, s3, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
@@ -2131,7 +2130,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
 ; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -2172,36 +2171,36 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -2209,14 +2208,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
 ; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -2247,75 +2246,75 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -2343,63 +2342,62 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7f800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v41, v41 :: v_dual_mov_b32 v42, 0x7f800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0x7f800000
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_max_f32 v3, v0, v0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -3185,43 +3183,43 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3230,14 +3228,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
+; GFX9-DPP-NEXT:    v_max_f32_e64 v3, s4, s4
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX9-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
+; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
 ; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -3278,51 +3276,51 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v3, s2, s2
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v4, s3, s3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 0
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v40, s2, s2
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v41, s3, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
@@ -3330,7 +3328,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
 ; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -3371,36 +3369,36 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -3408,14 +3406,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
 ; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -3446,75 +3444,75 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -3542,63 +3540,62 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7f800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v41, v41 :: v_dual_mov_b32 v42, 0x7f800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0x7f800000
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_max_f32 v3, v0, v0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 4a00d7bc71bca8a..b6f8c9392601318 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -799,36 +799,36 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -883,50 +883,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -966,44 +966,44 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -1033,62 +1033,62 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -1116,54 +1116,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -2049,36 +2049,36 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -2133,50 +2133,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -2216,44 +2216,44 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -2283,62 +2283,62 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -2366,54 +2366,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -3299,36 +3299,36 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3383,50 +3383,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -3466,44 +3466,44 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -3533,62 +3533,62 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -3616,54 +3616,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -4091,36 +4091,36 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -4175,50 +4175,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -4258,44 +4258,44 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -4325,62 +4325,62 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -4408,54 +4408,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -5340,36 +5340,36 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -5424,50 +5424,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -5507,44 +5507,44 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -5574,62 +5574,62 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -5657,54 +5657,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 5e922f608c1daa7..7abb789019f1f0c 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -106,12 +106,14 @@
 ; GCN-O0-NEXT:        Slot index numbering
 ; GCN-O0-NEXT:        Live Interval Analysis
 ; GCN-O0-NEXT:        SI Whole Quad Mode
-; GCN-O0-NEXT:        Virtual Register Map
-; GCN-O0-NEXT:        Live Register Matrix
-; GCN-O0-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O0-NEXT:        AMDGPU Pre-RA Long Branch Reg
 ; GCN-O0-NEXT:        Fast Register Allocator
 ; GCN-O0-NEXT:        SI lower SGPR spill instructions
+; GCN-O0-NEXT:        Slot index numbering
+; GCN-O0-NEXT:        Live Interval Analysis
+; GCN-O0-NEXT:        Virtual Register Map
+; GCN-O0-NEXT:        Live Register Matrix
+; GCN-O0-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O0-NEXT:        Fast Register Allocator
 ; GCN-O0-NEXT:        SI Lower WWM Copies
 ; GCN-O0-NEXT:        SI Fix VGPR copies
@@ -334,9 +336,6 @@
 ; GCN-O1-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O1-NEXT:        Machine Instruction Scheduler
 ; GCN-O1-NEXT:        SI Whole Quad Mode
-; GCN-O1-NEXT:        Virtual Register Map
-; GCN-O1-NEXT:        Live Register Matrix
-; GCN-O1-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O1-NEXT:        SI optimize exec mask operations pre-RA
 ; GCN-O1-NEXT:        AMDGPU Pre-RA Long Branch Reg
 ; GCN-O1-NEXT:        Machine Natural Loop Construction
@@ -354,6 +353,7 @@
 ; GCN-O1-NEXT:        SI lower SGPR spill instructions
 ; GCN-O1-NEXT:        Virtual Register Map
 ; GCN-O1-NEXT:        Live Register Matrix
+; GCN-O1-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O1-NEXT:        Greedy Register Allocator
 ; GCN-O1-NEXT:        SI Lower WWM Copies
 ; GCN-O1-NEXT:        GCN NSA Reassign
@@ -625,9 +625,6 @@
 ; GCN-O1-OPTS-NEXT:        AMDGPU Pre-RA optimizations
 ; GCN-O1-OPTS-NEXT:        Machine Instruction Scheduler
 ; GCN-O1-OPTS-NEXT:        SI Whole Quad Mode
-; GCN-O1-OPTS-NEXT:        Virtual Register Map
-; GCN-O1-OPTS-NEXT:        Live Register Matrix
-; GCN-O1-OPTS-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O1-OPTS-NEXT:        SI optimize exec mask operations pre-RA
 ; GCN-O1-OPTS-NEXT:        AMDGPU Pre-RA Long Branch Reg
 ; GCN-O1-OPTS-NEXT:        Machine Natural Loop Construction
@@ -645,6 +642,7 @@
 ; GCN-O1-OPTS-NEXT:        SI lower SGPR spill instructions
 ; GCN-O1-OPTS-NEXT:        Virtual Register Map
 ; GCN-O1-OPTS-NEXT:        Live Register Matrix
+; GCN-O1-OPTS-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O1-OPTS-NEXT:        Greedy Register Allocator
 ; GCN-O1-OPTS-NEXT:        SI Lower WWM Copies
 ; GCN-O1-OPTS-NEXT:        GCN NSA Reassign
@@ -927,9 +925,6 @@
 ; GCN-O2-NEXT:        AMDGPU Pre-RA optimizations
 ; GCN-O2-NEXT:        Machine Instruction Scheduler
 ; GCN-O2-NEXT:        SI Whole Quad Mode
-; GCN-O2-NEXT:        Virtual Register Map
-; GCN-O2-NEXT:        Live Register Matrix
-; GCN-O2-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O2-NEXT:        SI optimize exec mask operations pre-RA
 ; GCN-O2-NEXT:        SI Form memory clauses
 ; GCN-O2-NEXT:        AMDGPU Pre-RA Long Branch Reg
@@ -948,6 +943,7 @@
 ; GCN-O2-NEXT:        SI lower SGPR spill instructions
 ; GCN-O2-NEXT:        Virtual Register Map
 ; GCN-O2-NEXT:        Live Register Matrix
+; GCN-O2-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O2-NEXT:        Greedy Register Allocator
 ; GCN-O2-NEXT:        SI Lower WWM Copies
 ; GCN-O2-NEXT:        GCN NSA Reassign
@@ -1242,9 +1238,6 @@
 ; GCN-O3-NEXT:        AMDGPU Pre-RA optimizations
 ; GCN-O3-NEXT:        Machine Instruction Scheduler
 ; GCN-O3-NEXT:        SI Whole Quad Mode
-; GCN-O3-NEXT:        Virtual Register Map
-; GCN-O3-NEXT:        Live Register Matrix
-; GCN-O3-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O3-NEXT:        SI optimize exec mask operations pre-RA
 ; GCN-O3-NEXT:        SI Form memory clauses
 ; GCN-O3-NEXT:        AMDGPU Pre-RA Long Branch Reg
@@ -1263,6 +1256,7 @@
 ; GCN-O3-NEXT:        SI lower SGPR spill instructions
 ; GCN-O3-NEXT:        Virtual Register Map
 ; GCN-O3-NEXT:        Live Register Matrix
+; GCN-O3-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O3-NEXT:        Greedy Register Allocator
 ; GCN-O3-NEXT:        SI Lower WWM Copies
 ; GCN-O3-NEXT:        GCN NSA Reassign
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index ba506a88e1665f1..7c8507fe4559fb4 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -20,6 +20,7 @@
 ; DEFAULT-NEXT: SI lower SGPR spill instructions
 ; DEFAULT-NEXT: Virtual Register Map
 ; DEFAULT-NEXT: Live Register Matrix
+; DEFAULT-NEXT: SI Pre-allocate WWM Registers
 ; DEFAULT-NEXT: Greedy Register Allocator
 ; DEFAULT-NEXT: SI Lower WWM Copies
 ; DEFAULT-NEXT: GCN NSA Reassign
@@ -28,6 +29,11 @@
 
 ; O0: Fast Register Allocator
 ; O0-NEXT: SI lower SGPR spill instructions
+; O0-NEXT: Slot index numbering
+; O0-NEXT: Live Interval Analysis
+; O0-NEXT: Virtual Register Map
+; O0-NEXT: Live Register Matrix
+; O0-NEXT: SI Pre-allocate WWM Registers
 ; O0-NEXT: Fast Register Allocator
 ; O0-NEXT: SI Lower WWM Copies
 ; O0-NEXT: SI Fix VGPR copies
@@ -46,6 +52,7 @@
 ; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions
 ; BASIC-DEFAULT-NEXT: Virtual Register Map
 ; BASIC-DEFAULT-NEXT: Live Register Matrix
+; BASIC-DEFAULT-NEXT: SI Pre-allocate WWM Registers
 ; BASIC-DEFAULT-NEXT: Bundle Machine CFG Edges
 ; BASIC-DEFAULT-NEXT: Spill Code Placement Analysis
 ; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis
@@ -63,6 +70,7 @@
 ; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions
 ; DEFAULT-BASIC-NEXT: Virtual Register Map
 ; DEFAULT-BASIC-NEXT: Live Register Matrix
+; DEFAULT-BASIC-NEXT: SI Pre-allocate WWM Registers
 ; DEFAULT-BASIC-NEXT: Basic Register Allocator
 ; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
 ; DEFAULT-BASIC-NEXT: GCN NSA Reassign
@@ -82,6 +90,7 @@
 ; BASIC-BASIC-NEXT: SI lower SGPR spill instructions
 ; BASIC-BASIC-NEXT: Virtual Register Map
 ; BASIC-BASIC-NEXT: Live Register Matrix
+; BASIC-BASIC-NEXT: SI Pre-allocate WWM Registers
 ; BASIC-BASIC-NEXT: Basic Register Allocator
 ; BASIC-BASIC-NEXT: SI Lower WWM Copies
 ; BASIC-BASIC-NEXT: GCN NSA Reassign
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index d2340872eb545bb..b1aaae39af75263 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -371,13 +371,13 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_mov_b32 s48, s33
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x400
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s40, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s34, s4
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
@@ -393,9 +393,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_mov_b32 s38, s43
 ; GFX9-O0-NEXT:    s_mov_b32 s39, s42
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s34
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
 ; GFX9-O0-NEXT:    s_getpc_b64 s[42:43]
@@ -405,19 +405,19 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_mov_b64 s[44:45], s[0:1]
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[44:45]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[46:47]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[42:43]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    v_add_u32_e64 v1, v1, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT:    v_add_u32_e64 v3, v4, v3
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffffc00
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s48
@@ -430,34 +430,34 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O3-NEXT:    s_mov_b32 s38, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX9-O3-NEXT:    v_writelane_b32 v2, s30, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-O3-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-O3-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-O3-NEXT:    s_getpc_b64 s[36:37]
 ; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called at rel32@lo+4
 ; GFX9-O3-NEXT:    s_addc_u32 s37, s37, strict_wwm_called at rel32@hi+12
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-O3-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-O3-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s38
@@ -559,30 +559,29 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-LABEL: strict_wwm_call_i64:
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT:    s_mov_b32 s48, s33
+; GFX9-O0-NEXT:    s_mov_b32 s46, s33
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x1000
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s30, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s31, 1
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr11 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s30, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s31, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s34, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s38, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s4
@@ -598,30 +597,27 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    s_mov_b32 s41, s45
 ; GFX9-O0-NEXT:    s_mov_b32 s42, s44
 ; GFX9-O0-NEXT:    s_mov_b32 s43, s35
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s40, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s41, 1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s42, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s43, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v11, s40, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v11, s41, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v11, s42, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v11, s43, 3
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    s_mov_b32 s35, s9
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    s_mov_b64 s[36:37], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s36
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s37
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s36
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s37
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s34, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s35, 5
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
+; GFX9-O0-NEXT:    v_writelane_b32 v11, s34, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v11, s35, 5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v9
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 32
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s34, v[8:9]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], s34, v[9:10]
 ; GFX9-O0-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-O0-NEXT:    s_add_u32 s34, s34, strict_wwm_called_i64 at gotpcrel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s35, s35, strict_wwm_called_i64 at gotpcrel32@hi+12
@@ -630,56 +626,46 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    s_mov_b64 s[36:37], s[0:1]
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v13
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s34, v6, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s35, v6, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s36, v6, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s37, v6, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s38, v6, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s39, v6, 3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    v_readlane_b32 s34, v11, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s35, v11, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s36, v11, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s37, v11, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s38, v11, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s39, v11, 3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT:    v_add_co_u32_e64 v2, s[40:41], v2, v4
-; GFX9-O0-NEXT:    v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41]
+; GFX9-O0-NEXT:    v_add_co_u32_e64 v9, s[40:41], v12, v9
+; GFX9-O0-NEXT:    v_addc_co_u32_e64 v10, s[40:41], v13, v10, s[40:41]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v10
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
-; GFX9-O0-NEXT:    buffer_store_dwordx2 v[6:7], off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v10, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s30, v10, 0
-; GFX9-O0-NEXT:    ; kill: killed $vgpr0
+; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v8, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s30, v8, 0
+; GFX9-O0-NEXT:    ; kill: killed $vgpr11
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffff000
-; GFX9-O0-NEXT:    s_mov_b32 s33, s48
+; GFX9-O0-NEXT:    s_mov_b32 s33, s46
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -689,54 +675,49 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    s_mov_b32 s40, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_writelane_b32 v8, s30, 0
+; GFX9-O3-NEXT:    v_writelane_b32 v6, s30, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-O3-NEXT:    v_writelane_b32 v8, s31, 1
+; GFX9-O3-NEXT:    v_writelane_b32 v6, s31, 1
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    s_getpc_b64 s[36:37]
 ; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called_i64 at gotpcrel32@lo+4
 ; GFX9-O3-NEXT:    s_addc_u32 s37, s37, strict_wwm_called_i64 at gotpcrel32@hi+12
 ; GFX9-O3-NEXT:    s_load_dwordx2 s[36:37], s[36:37], 0x0
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s8
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s9
+; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s8
+; GFX9-O3-NEXT:    v_mov_b32_e32 v8, s9
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9-O3-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[38:39], -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v7
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v1
+; GFX9-O3-NEXT:    v_add_co_u32_e32 v7, vcc, v9, v7
+; GFX9-O3-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v8, vcc
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[38:39]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v7
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT:    v_readlane_b32 s31, v8, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v8, 0
+; GFX9-O3-NEXT:    v_readlane_b32 s31, v6, 1
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s40
@@ -922,64 +903,64 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
-; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_writelane_b32 v42, s64, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v42, s65, 1
-; GFX9-O0-NEXT:    v_writelane_b32 v42, s66, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v42, s67, 3
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_writelane_b32 v32, s64, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v32, s65, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v32, s66, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v32, s67, 3
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
@@ -989,145 +970,145 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v43, s5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s11
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s12
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s13
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s14
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s15
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s16
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s17
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s19
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s20
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s21
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s22
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s23
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s24
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v47, s25
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v46, s26
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v45, s27
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v44, s28
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s29
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v47
-; GFX9-O0-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v46
-; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v45
-; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v24, v44
-; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v25, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v26, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v27, v47
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v46
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v29, v45
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v30, v44
 ; GFX9-O0-NEXT:    ; kill: def $vgpr31 killed $vgpr43 killed $exec
-; GFX9-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
@@ -1157,55 +1138,55 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
 ; GFX9-O0-NEXT:    s_mov_b64 s[34:35], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v11
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v34, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v35, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v35, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v36, v9
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v34, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v35, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v35, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v36, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v36, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v37, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v37, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v38, v7
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v36, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v37, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v37, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v38, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v38, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v39, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v39, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v40, v5
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v38, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v39, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v39, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v40, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v40, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v41, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v41, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v42, v3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v40, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v41, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v41, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v42, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v33
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v34
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v32
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v33
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v36
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v35
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v37
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v38
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:20
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v36
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v37
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:28
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v38
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v39
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:24
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v41
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v42
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:36
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v40
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v41
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:32
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s5
@@ -1233,19 +1214,20 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v23, s27
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v24, s28
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v25, s29
-; GFX9-O0-NEXT:    v_readlane_b32 s67, v42, 3
-; GFX9-O0-NEXT:    v_readlane_b32 s66, v42, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s65, v42, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s64, v42, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s67, v32, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s66, v32, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s65, v32, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s64, v32, 0
 ; GFX9-O0-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
@@ -1255,8 +1237,7 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index c134cf7ea3aaa40..8e90bdec5cb111b 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -328,38 +328,32 @@ define hidden i32 @called(i32 %a) noinline {
 define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-LABEL: call:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_mov_b32 s32, 0x400
+; GFX9-O0-NEXT:    s_mov_b32 s32, 0
 ; GFX9-O0-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; GFX9-O0-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
 ; GFX9-O0-NEXT:    s_mov_b32 s26, -1
 ; GFX9-O0-NEXT:    s_mov_b32 s27, 0xe00000
 ; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s10
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v7, s10, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v7, s11, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s10, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s11, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s13, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s12, s8
 ; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v7, s4, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v7, s5, 3
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s4, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s5, 3
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v7, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v7, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 1
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v7, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v7, 3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
@@ -373,23 +367,19 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-NEXT:    s_mov_b32 s17, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s18, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s3
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s16, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s17, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s18, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s19, 7
 ; GFX9-O0-NEXT:    s_mov_b32 s3, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s2
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s3, 8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s2
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s2, 9
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 10
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s2, 9
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s3, 10
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 56
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -405,34 +395,28 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[24:25]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[26:27]
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 20
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s15, v3
-; GFX9-O0-NEXT:    s_mov_b32 s15, 10
 ; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
-; GFX9-O0-NEXT:    v_or3_b32 v3, v5, v4, v3
+; GFX9-O0-NEXT:    s_mov_b32 s15, 10
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v5, s15, v5
+; GFX9-O0-NEXT:    v_or3_b32 v4, v6, v5, v4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v1, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v1, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v1, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v1, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v1, 9
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v1, 10
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v1, 8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    v_add_u32_e64 v3, v3, v6
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v3, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v3, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v3, 8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT:    v_add_u32_e64 v4, v4, v7
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s4 offset:4
-; GFX9-O0-NEXT:    ; kill: killed $vgpr0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s4 offset:4
+; GFX9-O0-NEXT:    ; kill: killed $vgpr3
 ; GFX9-O0-NEXT:    s_endpgm
 ;
 ; GFX9-O3-LABEL: call:
@@ -459,10 +443,10 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[14:15]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-O3-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[10:11]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    s_endpgm
 
@@ -562,38 +546,32 @@ define i64 @called_i64(i64 %a) noinline {
 define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) {
 ; GFX9-O0-LABEL: call_i64:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_mov_b32 s32, 0x400
+; GFX9-O0-NEXT:    s_mov_b32 s32, 0
 ; GFX9-O0-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; GFX9-O0-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
 ; GFX9-O0-NEXT:    s_mov_b32 s26, -1
 ; GFX9-O0-NEXT:    s_mov_b32 s27, 0xe00000
 ; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr12 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s10
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v12, s10, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v12, s11, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s10, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s11, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s13, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s12, s8
 ; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v12, s4, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v12, s5, 3
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s4, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s5, 3
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v12, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v12, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v8, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 1
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v12, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v12, 3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v0
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x2c
@@ -607,24 +585,20 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    s_mov_b32 s17, s15
 ; GFX9-O0-NEXT:    s_mov_b32 s18, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s8
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s16, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s17, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s18, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s19, 7
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s9
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s2, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 9
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s2, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s3, 9
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 60
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -634,11 +608,10 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    s_addc_u32 s0, s0, s1
 ; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
 ; GFX9-O0-NEXT:    s_mov_b32 s0, 32
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[10:11], s0, v[8:9]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[15:16], s0, v[12:13]
 ; GFX9-O0-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-O0-NEXT:    s_add_u32 s0, s0, called_i64 at gotpcrel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s1, s1, called_i64 at gotpcrel32@hi+12
@@ -646,43 +619,36 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[24:25]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[26:27]
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 20
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s15, v3
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v9, s15, v9
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 10
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
-; GFX9-O0-NEXT:    v_or3_b32 v3, v5, v4, v3
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v10, s15, v10
+; GFX9-O0-NEXT:    v_or3_b32 v9, v11, v10, v9
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v15
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v2, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v2, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v2, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v2, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v2, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v2, 9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v8, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v8, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v8, 9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
-; GFX9-O0-NEXT:    v_add_co_u32_e64 v3, s[6:7], v3, v5
-; GFX9-O0-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v4, v6, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v13
+; GFX9-O0-NEXT:    v_add_co_u32_e64 v9, s[6:7], v9, v11
+; GFX9-O0-NEXT:    v_addc_co_u32_e64 v10, s[6:7], v10, v12, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v10
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0
-; GFX9-O0-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], s4 offset:4
-; GFX9-O0-NEXT:    ; kill: killed $vgpr0
+; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4
+; GFX9-O0-NEXT:    ; kill: killed $vgpr8
 ; GFX9-O0-NEXT:    s_endpgm
 ;
 ; GFX9-O3-LABEL: call_i64:
@@ -715,13 +681,13 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v1
+; GFX9-O3-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
+; GFX9-O3-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0 offset:4
 ; GFX9-O3-NEXT:    s_endpgm
 
@@ -1196,38 +1162,32 @@ define hidden i32 @strict_wwm_called(i32 %a) noinline {
 define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-LABEL: strict_wwm_call:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_mov_b32 s32, 0x400
+; GFX9-O0-NEXT:    s_mov_b32 s32, 0
 ; GFX9-O0-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; GFX9-O0-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
 ; GFX9-O0-NEXT:    s_mov_b32 s26, -1
 ; GFX9-O0-NEXT:    s_mov_b32 s27, 0xe00000
 ; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s10
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v7, s10, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v7, s11, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s10, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s11, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s13, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s12, s8
 ; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v7, s4, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v7, s5, 3
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s4, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s5, 3
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v7, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v7, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 1
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v7, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v7, 3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
@@ -1241,23 +1201,19 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O0-NEXT:    s_mov_b32 s17, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s18, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s3
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s16, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s17, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s18, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s19, 7
 ; GFX9-O0-NEXT:    s_mov_b32 s3, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s2
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s3, 8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s2
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s2, 9
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 10
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s2, 9
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s3, 10
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 56
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -1273,34 +1229,28 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[24:25]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[26:27]
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 20
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s15, v3
-; GFX9-O0-NEXT:    s_mov_b32 s15, 10
 ; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
-; GFX9-O0-NEXT:    v_or3_b32 v3, v5, v4, v3
+; GFX9-O0-NEXT:    s_mov_b32 s15, 10
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v5, s15, v5
+; GFX9-O0-NEXT:    v_or3_b32 v4, v6, v5, v4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v1, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v1, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v1, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v1, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v1, 9
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v1, 10
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v1, 8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    v_add_u32_e64 v3, v3, v6
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v3, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v3, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v3, 8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT:    v_add_u32_e64 v4, v4, v7
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s4 offset:4
-; GFX9-O0-NEXT:    ; kill: killed $vgpr0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s4 offset:4
+; GFX9-O0-NEXT:    ; kill: killed $vgpr3
 ; GFX9-O0-NEXT:    s_endpgm
 ;
 ; GFX9-O3-LABEL: strict_wwm_call:
@@ -1327,10 +1277,10 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[14:15]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-O3-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[10:11]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    s_endpgm
 
@@ -1430,38 +1380,32 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline {
 define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) {
 ; GFX9-O0-LABEL: strict_wwm_call_i64:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_mov_b32 s32, 0x400
+; GFX9-O0-NEXT:    s_mov_b32 s32, 0
 ; GFX9-O0-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; GFX9-O0-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
 ; GFX9-O0-NEXT:    s_mov_b32 s26, -1
 ; GFX9-O0-NEXT:    s_mov_b32 s27, 0xe00000
 ; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr12 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s10
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v12, s10, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v12, s11, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s10, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s11, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s13, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s12, s8
 ; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v12, s4, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v12, s5, 3
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s4, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s5, 3
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v12, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v12, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v8, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 1
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v12, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v12, 3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v0
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x2c
@@ -1475,24 +1419,20 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    s_mov_b32 s17, s15
 ; GFX9-O0-NEXT:    s_mov_b32 s18, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s8
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s16, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s17, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s18, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s19, 7
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s9
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s2, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 9
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s2, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s3, 9
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 60
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -1502,11 +1442,10 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    s_addc_u32 s0, s0, s1
 ; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
 ; GFX9-O0-NEXT:    s_mov_b32 s0, 32
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[10:11], s0, v[8:9]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[15:16], s0, v[12:13]
 ; GFX9-O0-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-O0-NEXT:    s_add_u32 s0, s0, strict_wwm_called_i64 at gotpcrel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s1, s1, strict_wwm_called_i64 at gotpcrel32@hi+12
@@ -1514,43 +1453,36 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[24:25]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[26:27]
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 20
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s15, v3
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v9, s15, v9
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 10
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
-; GFX9-O0-NEXT:    v_or3_b32 v3, v5, v4, v3
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v10, s15, v10
+; GFX9-O0-NEXT:    v_or3_b32 v9, v11, v10, v9
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v15
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v2, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v2, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v2, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v2, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v2, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v2, 9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v8, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v8, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v8, 9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
-; GFX9-O0-NEXT:    v_add_co_u32_e64 v3, s[6:7], v3, v5
-; GFX9-O0-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v4, v6, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v13
+; GFX9-O0-NEXT:    v_add_co_u32_e64 v9, s[6:7], v9, v11
+; GFX9-O0-NEXT:    v_addc_co_u32_e64 v10, s[6:7], v10, v12, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v10
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0
-; GFX9-O0-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], s4 offset:4
-; GFX9-O0-NEXT:    ; kill: killed $vgpr0
+; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4
+; GFX9-O0-NEXT:    ; kill: killed $vgpr8
 ; GFX9-O0-NEXT:    s_endpgm
 ;
 ; GFX9-O3-LABEL: strict_wwm_call_i64:
@@ -1583,13 +1515,13 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v1
+; GFX9-O3-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
+; GFX9-O3-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0 offset:4
 ; GFX9-O3-NEXT:    s_endpgm
 

>From 24a245e1282738482d9ae2e94d7d648116f5bee1 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Wed, 1 Nov 2023 11:58:50 +0900
Subject: [PATCH 2/3] Skip UsedPhysRegMask test when calling isPhysRegUsed. We
 are not concerned about clobbers from calls as these will be spilt correctly
 -- and these were never considered before.

---
 .../Target/AMDGPU/SIPreAllocateWWMRegs.cpp    |    2 +-
 .../AMDGPU/global_atomics_scan_fadd.ll        | 1152 ++++++++--------
 .../AMDGPU/global_atomics_scan_fmax.ll        |  927 ++++++-------
 .../AMDGPU/global_atomics_scan_fmin.ll        |  927 ++++++-------
 .../AMDGPU/global_atomics_scan_fsub.ll        | 1200 ++++++++---------
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |   99 +-
 llvm/test/CodeGen/AMDGPU/wwm-reserved.ll      |  130 +-
 7 files changed, 2226 insertions(+), 2211 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index fc35bec0edd3bdb..dac36e061a0d5bb 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -99,7 +99,7 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
   LiveInterval &LI = LIS->getInterval(Reg);
 
   for (MCRegister PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) {
-    if (!MRI->isPhysRegUsed(PhysReg) &&
+    if (!MRI->isPhysRegUsed(PhysReg, true) &&
         Matrix->checkInterference(LI, PhysReg) == LiveRegMatrix::IK_Free) {
       Matrix->assign(LI, PhysReg);
       assert(PhysReg != 0);
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 6844bebb8ab7623..4cbd5e84871cc75 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -725,36 +725,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -809,50 +809,50 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -892,44 +892,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -959,53 +959,53 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1164-DPP-NEXT:    global_atomic_add_f32 v4, v0, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB1_2:
 ; GFX1164-DPP-NEXT:    s_nop 0
 ; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1031,45 +1031,45 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1132-DPP-NEXT:    global_atomic_add_f32 v4, v0, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB1_2:
 ; GFX1132-DPP-NEXT:    s_nop 0
 ; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1953,36 +1953,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -2037,50 +2037,50 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -2120,44 +2120,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -2187,62 +2187,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v4, v5, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -2270,54 +2270,54 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v4, v5, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -3181,36 +3181,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3265,50 +3265,50 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -3348,44 +3348,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -3415,53 +3415,53 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1164-DPP-NEXT:    global_atomic_add_f32 v4, v0, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB5_2:
 ; GFX1164-DPP-NEXT:    s_nop 0
 ; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3487,45 +3487,45 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1132-DPP-NEXT:    global_atomic_add_f32 v4, v0, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB5_2:
 ; GFX1132-DPP-NEXT:    s_nop 0
 ; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3929,36 +3929,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -4013,50 +4013,50 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -4096,44 +4096,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -4163,53 +4163,53 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1164-DPP-NEXT:    global_atomic_add_f32 v4, v0, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB6_2:
 ; GFX1164-DPP-NEXT:    s_nop 0
 ; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4235,45 +4235,45 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1132-DPP-NEXT:    global_atomic_add_f32 v4, v0, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB6_2:
 ; GFX1132-DPP-NEXT:    s_nop 0
 ; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5156,36 +5156,36 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -5240,50 +5240,50 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -5323,44 +5323,44 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -5390,62 +5390,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v4, v5, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -5473,54 +5473,54 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v4, v5, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 29f6ce7a8331ccd..e07514b063ee481 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -787,43 +787,43 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -832,14 +832,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_max_f32_e64 v3, s4, s4
+; GFX9-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX9-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
 ; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -880,51 +880,51 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v40, s2, s2
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v41, s3, s3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 0
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v3, s2, s2
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v4, s3, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
@@ -932,7 +932,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
 ; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -973,36 +973,36 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -1010,14 +1010,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
 ; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -1048,75 +1048,75 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0xff800000
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -1144,62 +1144,63 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0xff800000
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v41, v41 :: v_dual_mov_b32 v42, 0xff800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0xff800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_max_f32 v3, v0, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -1985,43 +1986,43 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -2030,14 +2031,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_max_f32_e64 v3, s4, s4
+; GFX9-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX9-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
 ; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -2078,51 +2079,51 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v40, s2, s2
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v41, s3, s3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 0
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v3, s2, s2
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v4, s3, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
@@ -2130,7 +2131,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
 ; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -2171,36 +2172,36 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -2208,14 +2209,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
 ; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -2246,75 +2247,75 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0xff800000
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -2342,62 +2343,63 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0xff800000
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v41, v41 :: v_dual_mov_b32 v42, 0xff800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0xff800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_max_f32 v3, v0, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -3183,43 +3185,43 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3228,14 +3230,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_max_f32_e64 v3, s4, s4
+; GFX9-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX9-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
 ; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -3276,51 +3278,51 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v40, s2, s2
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v41, s3, s3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 0
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v3, s2, s2
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v4, s3, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
@@ -3328,7 +3330,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
 ; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -3369,36 +3371,36 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0xff800000
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0xff800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -3406,14 +3408,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
 ; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -3444,75 +3446,75 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0xff800000
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0xff800000
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0xff800000
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -3540,62 +3542,63 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0xff800000
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v41, v41 :: v_dual_mov_b32 v42, 0xff800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0xff800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0xff800000
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_max_f32 v3, v0, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index a760f963b54eca4..ad6edbd2c37a09d 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -787,43 +787,43 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -832,14 +832,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_max_f32_e64 v3, s4, s4
+; GFX9-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX9-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
 ; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -880,51 +880,51 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v40, s2, s2
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v41, s3, s3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 0
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v3, s2, s2
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v4, s3, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
@@ -932,7 +932,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
 ; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -973,36 +973,36 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -1010,14 +1010,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
 ; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -1048,75 +1048,75 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -1144,62 +1144,63 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v41, v41 :: v_dual_mov_b32 v42, 0x7f800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7f800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0x7f800000
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_max_f32 v3, v0, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -1985,43 +1986,43 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -2030,14 +2031,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_max_f32_e64 v3, s4, s4
+; GFX9-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX9-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
 ; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -2078,51 +2079,51 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v40, s2, s2
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v41, s3, s3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 0
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v3, s2, s2
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v4, s3, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
@@ -2130,7 +2131,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
 ; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -2171,36 +2172,36 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -2208,14 +2209,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
 ; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -2246,75 +2247,75 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -2342,62 +2343,63 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v41, v41 :: v_dual_mov_b32 v42, 0x7f800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7f800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0x7f800000
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_max_f32 v3, v0, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -3183,43 +3185,43 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX9-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3228,14 +3230,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_max_f32_e64 v3, s4, s4
+; GFX9-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX9-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
 ; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -3276,51 +3278,51 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v40, s2, s2
-; GFX1064-DPP-NEXT:    v_max_f32_e64 v41, s3, s3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 0
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v3, s2, s2
+; GFX1064-DPP-NEXT:    v_max_f32_e64 v4, s3, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
@@ -3328,7 +3330,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
 ; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
@@ -3369,36 +3371,36 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -3406,14 +3408,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
 ; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
@@ -3444,75 +3446,75 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v42, 0x7f800000
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v3, v0, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -3540,62 +3542,63 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v41, v41 :: v_dual_mov_b32 v42, 0x7f800000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7f800000
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0x7f800000
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v41, v42, v42 :: v_dual_mov_b32 v42, 0x7f800000
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v42, v42
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v40
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_max_f32 v3, v0, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v1, v2, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index b6f8c9392601318..4a00d7bc71bca8a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -799,36 +799,36 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -883,50 +883,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -966,44 +966,44 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -1033,62 +1033,62 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -1116,54 +1116,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -2049,36 +2049,36 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -2133,50 +2133,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -2216,44 +2216,44 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -2283,62 +2283,62 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -2366,54 +2366,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -3299,36 +3299,36 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -3383,50 +3383,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -3466,44 +3466,44 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -3533,62 +3533,62 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -3616,54 +3616,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -4091,36 +4091,36 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -4175,50 +4175,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -4258,44 +4258,44 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -4325,62 +4325,62 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -4408,54 +4408,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
@@ -5340,36 +5340,36 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX9-DPP-NEXT:    v_readlane_b32 s4, v40, 63
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -5424,50 +5424,50 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v40, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v40, 32
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_add_f32_e64 v40, s2, s3
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1064-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -5507,44 +5507,44 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
 ; GFX1032-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
@@ -5574,62 +5574,62 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v41, v40
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v41, v40
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -5657,54 +5657,54 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v40, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v40, v41 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v41, v40
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v41, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v42, 1
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v41, v40 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v42, v40 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v42
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v41, v40
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v41, v41, -1, -1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v40, v40, v41
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v40
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB8_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v4, v5, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index b1aaae39af75263..33b640a177c1b75 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -373,7 +373,7 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x400
 ; GFX9-O0-NEXT:    v_writelane_b32 v2, s30, 0
@@ -407,17 +407,17 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[46:47]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[42:43]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-O0-NEXT:    v_add_u32_e64 v3, v4, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT:    v_add_u32_e64 v1, v1, v3
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[36:39], s34 offset:4
 ; GFX9-O0-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffffc00
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s48
@@ -432,7 +432,7 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_writelane_b32 v2, s30, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x400
@@ -447,17 +447,17 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called at rel32@lo+4
 ; GFX9-O3-NEXT:    s_addc_u32 s37, s37, strict_wwm_called at rel32@hi+12
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-O3-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX9-O3-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xfc00
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s38
@@ -567,16 +567,16 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x1000
 ; GFX9-O0-NEXT:    ; implicit-def: $vgpr11 : SGPR spill to VGPR lane
@@ -614,10 +614,10 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    v_writelane_b32 v11, s34, 4
 ; GFX9-O0-NEXT:    v_writelane_b32 v11, s35, 5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v9
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 32
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], s34, v[9:10]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s34, v[9:10]
 ; GFX9-O0-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-O0-NEXT:    s_add_u32 s34, s34, strict_wwm_called_i64 at gotpcrel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s35, s35, strict_wwm_called_i64 at gotpcrel32@hi+12
@@ -626,8 +626,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    s_mov_b64 s[36:37], s[0:1]
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-O0-NEXT:    v_readlane_b32 s34, v11, 4
@@ -636,15 +636,17 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_readlane_b32 s37, v11, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s38, v11, 2
 ; GFX9-O0-NEXT:    v_readlane_b32 s39, v11, 3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
-; GFX9-O0-NEXT:    v_add_co_u32_e64 v9, s[40:41], v12, v9
-; GFX9-O0-NEXT:    v_addc_co_u32_e64 v10, s[40:41], v13, v10, s[40:41]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX9-O0-NEXT:    v_add_co_u32_e64 v2, s[40:41], v2, v4
+; GFX9-O0-NEXT:    v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
 ; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
 ; GFX9-O0-NEXT:    v_readlane_b32 s31, v8, 1
@@ -655,14 +657,14 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffff000
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s46
@@ -679,8 +681,11 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_writelane_b32 v6, s30, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x800
@@ -702,13 +707,13 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v0
-; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v1
-; GFX9-O3-NEXT:    v_add_co_u32_e32 v7, vcc, v9, v7
-; GFX9-O3-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v8, vcc
+; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
+; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[38:39]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v7
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v8
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    v_readlane_b32 s31, v6, 1
 ; GFX9-O3-NEXT:    v_readlane_b32 s30, v6, 0
@@ -716,8 +721,10 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xf800
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s40
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 8e90bdec5cb111b..835835103473f72 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -443,10 +443,10 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[14:15]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v0
-; GFX9-O3-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[10:11]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    s_endpgm
 
@@ -569,9 +569,9 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 2
 ; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x2c
@@ -590,11 +590,11 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    v_writelane_b32 v8, s18, 6
 ; GFX9-O0-NEXT:    v_writelane_b32 v8, s19, 7
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s9
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; GFX9-O0-NEXT:    v_writelane_b32 v8, s2, 8
@@ -608,10 +608,11 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    s_addc_u32 s0, s0, s1
 ; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
 ; GFX9-O0-NEXT:    s_mov_b32 s0, 32
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[15:16], s0, v[12:13]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[11:12], s0, v[9:10]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v11
 ; GFX9-O0-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-O0-NEXT:    s_add_u32 s0, s0, called_i64 at gotpcrel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s1, s1, called_i64 at gotpcrel32@hi+12
@@ -619,14 +620,14 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[24:25]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[26:27]
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 20
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v9, s15, v9
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s15, v3
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 10
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v10, s15, v10
-; GFX9-O0-NEXT:    v_or3_b32 v9, v11, v10, v9
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
+; GFX9-O0-NEXT:    v_or3_b32 v3, v5, v4, v3
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v15
+; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 4
@@ -635,17 +636,17 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 7
 ; GFX9-O0-NEXT:    v_readlane_b32 s4, v8, 8
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v8, 9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v13
-; GFX9-O0-NEXT:    v_add_co_u32_e64 v9, s[6:7], v9, v11
-; GFX9-O0-NEXT:    v_addc_co_u32_e64 v10, s[6:7], v10, v12, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
+; GFX9-O0-NEXT:    v_add_co_u32_e64 v3, s[6:7], v3, v5
+; GFX9-O0-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v4, v6, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0
 ; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4
 ; GFX9-O0-NEXT:    ; kill: killed $vgpr8
@@ -681,13 +682,13 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v0
-; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v1
-; GFX9-O3-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
-; GFX9-O3-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
+; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0 offset:4
 ; GFX9-O3-NEXT:    s_endpgm
 
@@ -1277,10 +1278,10 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[14:15]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[8:9]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v0
-; GFX9-O3-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[10:11]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    s_endpgm
 
@@ -1403,9 +1404,9 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 2
 ; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x2c
@@ -1424,11 +1425,11 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    v_writelane_b32 v8, s18, 6
 ; GFX9-O0-NEXT:    v_writelane_b32 v8, s19, 7
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s9
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; GFX9-O0-NEXT:    v_writelane_b32 v8, s2, 8
@@ -1442,10 +1443,11 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    s_addc_u32 s0, s0, s1
 ; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
 ; GFX9-O0-NEXT:    s_mov_b32 s0, 32
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[15:16], s0, v[12:13]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[11:12], s0, v[9:10]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v11
 ; GFX9-O0-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-O0-NEXT:    s_add_u32 s0, s0, strict_wwm_called_i64 at gotpcrel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s1, s1, strict_wwm_called_i64 at gotpcrel32@hi+12
@@ -1453,14 +1455,14 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[24:25]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[26:27]
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 20
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v9, s15, v9
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s15, v3
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 10
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v10, s15, v10
-; GFX9-O0-NEXT:    v_or3_b32 v9, v11, v10, v9
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
+; GFX9-O0-NEXT:    v_or3_b32 v3, v5, v4, v3
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v15
+; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 4
@@ -1469,17 +1471,17 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 7
 ; GFX9-O0-NEXT:    v_readlane_b32 s4, v8, 8
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v8, 9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v13
-; GFX9-O0-NEXT:    v_add_co_u32_e64 v9, s[6:7], v9, v11
-; GFX9-O0-NEXT:    v_addc_co_u32_e64 v10, s[6:7], v10, v12, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
+; GFX9-O0-NEXT:    v_add_co_u32_e64 v3, s[6:7], v3, v5
+; GFX9-O0-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v4, v6, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0
 ; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4
 ; GFX9-O0-NEXT:    ; kill: killed $vgpr8
@@ -1515,13 +1517,13 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v0
-; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v1
-; GFX9-O3-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
-; GFX9-O3-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
+; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0 offset:4
 ; GFX9-O3-NEXT:    s_endpgm
 

>From 8f454eba1d8b733d845b9e19074751908b847f9d Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 2 Nov 2023 18:07:57 +0900
Subject: [PATCH 3/3] Ignore AMDGPU::SI_SPILL_S32_TO_VGPR in STRICT_WWM
 regions.

---
 .../Target/AMDGPU/SIPreAllocateWWMRegs.cpp    |   3 +
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |  88 +++--
 llvm/test/CodeGen/AMDGPU/wwm-reserved.ll      | 366 +++++++++++-------
 3 files changed, 269 insertions(+), 188 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index dac36e061a0d5bb..ea66a270380482c 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -215,6 +215,9 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
           MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
         RegsAssigned |= processDef(MI.getOperand(0));
 
+      if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
+        continue;
+
       if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
           MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
           MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 33b640a177c1b75..ab7d3ca0ab425f9 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -559,27 +559,28 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-LABEL: strict_wwm_call_i64:
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT:    s_mov_b32 s46, s33
+; GFX9-O0-NEXT:    s_mov_b32 s48, s33
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x1000
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr11 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
 ; GFX9-O0-NEXT:    v_writelane_b32 v8, s30, 0
 ; GFX9-O0-NEXT:    v_writelane_b32 v8, s31, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s34, s8
@@ -597,10 +598,10 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    s_mov_b32 s41, s45
 ; GFX9-O0-NEXT:    s_mov_b32 s42, s44
 ; GFX9-O0-NEXT:    s_mov_b32 s43, s35
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s40, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s41, 1
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s42, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s43, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s40, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s41, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s42, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s43, 3
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    s_mov_b32 s35, s9
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35
@@ -612,8 +613,11 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s37
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s34, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v11, s35, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s34, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s35, 5
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v9
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 32
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
@@ -630,13 +634,20 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-O0-NEXT:    v_readlane_b32 s34, v11, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s35, v11, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s36, v11, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s37, v11, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s38, v11, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s39, v11, 3
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_readlane_b32 s34, v6, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s35, v6, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s36, v6, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s37, v6, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s38, v6, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s39, v6, 3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
@@ -645,29 +656,30 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_add_co_u32_e64 v2, s[40:41], v2, v4
 ; GFX9-O0-NEXT:    v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
-; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
+; GFX9-O0-NEXT:    buffer_store_dwordx2 v[6:7], off, s[36:39], s34 offset:4
 ; GFX9-O0-NEXT:    v_readlane_b32 s31, v8, 1
 ; GFX9-O0-NEXT:    v_readlane_b32 s30, v8, 0
-; GFX9-O0-NEXT:    ; kill: killed $vgpr11
+; GFX9-O0-NEXT:    ; kill: killed $vgpr0
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffff000
-; GFX9-O0-NEXT:    s_mov_b32 s33, s46
+; GFX9-O0-NEXT:    s_mov_b32 s33, s48
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 835835103473f72..c134cf7ea3aaa40 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -328,32 +328,38 @@ define hidden i32 @called(i32 %a) noinline {
 define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-LABEL: call:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_mov_b32 s32, 0
+; GFX9-O0-NEXT:    s_mov_b32 s32, 0x400
 ; GFX9-O0-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; GFX9-O0-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
 ; GFX9-O0-NEXT:    s_mov_b32 s26, -1
 ; GFX9-O0-NEXT:    s_mov_b32 s27, 0xe00000
 ; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s10
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s10, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s11, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v7, s10, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v7, s11, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s13, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s12, s8
 ; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s4, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s5, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v7, s4, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v7, s5, 3
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v7, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v7, 1
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v7, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v7, 3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
@@ -367,19 +373,23 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-NEXT:    s_mov_b32 s17, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s18, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s3
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s16, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s17, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s18, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s19, 7
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 7
 ; GFX9-O0-NEXT:    s_mov_b32 s3, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s3, 8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s2
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s2, 9
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s3, 10
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s2, 9
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 10
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 56
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -395,28 +405,34 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[24:25]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[26:27]
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 20
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s15, v3
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 10
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v5, s15, v5
-; GFX9-O0-NEXT:    v_or3_b32 v4, v6, v5, v4
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
+; GFX9-O0-NEXT:    v_or3_b32 v3, v5, v4, v3
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v3, 9
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v3, 10
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v3, 8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-O0-NEXT:    v_add_u32_e64 v4, v4, v7
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v1, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v1, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v1, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v1, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v1, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v1, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v1, 8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_add_u32_e64 v3, v3, v6
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s4 offset:4
-; GFX9-O0-NEXT:    ; kill: killed $vgpr3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s4 offset:4
+; GFX9-O0-NEXT:    ; kill: killed $vgpr0
 ; GFX9-O0-NEXT:    s_endpgm
 ;
 ; GFX9-O3-LABEL: call:
@@ -546,32 +562,38 @@ define i64 @called_i64(i64 %a) noinline {
 define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) {
 ; GFX9-O0-LABEL: call_i64:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_mov_b32 s32, 0
+; GFX9-O0-NEXT:    s_mov_b32 s32, 0x400
 ; GFX9-O0-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; GFX9-O0-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
 ; GFX9-O0-NEXT:    s_mov_b32 s26, -1
 ; GFX9-O0-NEXT:    s_mov_b32 s27, 0xe00000
 ; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr12 : SGPR spill to VGPR lane
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s10
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s10, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s11, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v12, s10, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v12, s11, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s13, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s12, s8
 ; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s4, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s5, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v12, s4, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v12, s5, 3
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v8, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v12, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v12, 1
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v12, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v12, 3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x2c
@@ -585,20 +607,24 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    s_mov_b32 s17, s15
 ; GFX9-O0-NEXT:    s_mov_b32 s18, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s8
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s16, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s17, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s18, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s19, 7
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 7
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s9
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s2, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s3, 9
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s2, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 9
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 60
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -608,11 +634,11 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    s_addc_u32 s0, s0, s1
 ; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX9-O0-NEXT:    s_mov_b32 s0, 32
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[11:12], s0, v[9:10]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v11
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[10:11], s0, v[8:9]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
 ; GFX9-O0-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-O0-NEXT:    s_add_u32 s0, s0, called_i64 at gotpcrel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s1, s1, called_i64 at gotpcrel32@hi+12
@@ -630,26 +656,33 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v8, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v8, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v8, 9
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v2, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v2, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v2, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v2, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v2, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v2, 9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
 ; GFX9-O0-NEXT:    v_add_co_u32_e64 v3, s[6:7], v3, v5
 ; GFX9-O0-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v4, v6, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0
-; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4
-; GFX9-O0-NEXT:    ; kill: killed $vgpr8
+; GFX9-O0-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], s4 offset:4
+; GFX9-O0-NEXT:    ; kill: killed $vgpr0
 ; GFX9-O0-NEXT:    s_endpgm
 ;
 ; GFX9-O3-LABEL: call_i64:
@@ -1163,32 +1196,38 @@ define hidden i32 @strict_wwm_called(i32 %a) noinline {
 define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-LABEL: strict_wwm_call:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_mov_b32 s32, 0
+; GFX9-O0-NEXT:    s_mov_b32 s32, 0x400
 ; GFX9-O0-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; GFX9-O0-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
 ; GFX9-O0-NEXT:    s_mov_b32 s26, -1
 ; GFX9-O0-NEXT:    s_mov_b32 s27, 0xe00000
 ; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s10
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s10, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s11, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v7, s10, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v7, s11, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s13, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s12, s8
 ; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s4, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s5, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v7, s4, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v7, s5, 3
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v7, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v7, 1
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v7, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v7, 3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
@@ -1202,19 +1241,23 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O0-NEXT:    s_mov_b32 s17, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s18, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s3
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s16, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s17, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s18, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s19, 7
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 7
 ; GFX9-O0-NEXT:    s_mov_b32 s3, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s3, 8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s2
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s2, 9
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s3, 10
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s2, 9
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 10
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 56
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -1230,28 +1273,34 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[24:25]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[26:27]
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 20
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s15, v3
 ; GFX9-O0-NEXT:    s_mov_b32 s15, 10
-; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v5, s15, v5
-; GFX9-O0-NEXT:    v_or3_b32 v4, v6, v5, v4
+; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
+; GFX9-O0-NEXT:    v_or3_b32 v3, v5, v4, v3
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v3, 9
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v3, 10
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v3, 8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-O0-NEXT:    v_add_u32_e64 v4, v4, v7
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v1, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v1, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v1, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v1, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v1, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v1, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v1, 8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    v_add_u32_e64 v3, v3, v6
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s4 offset:4
-; GFX9-O0-NEXT:    ; kill: killed $vgpr3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s4 offset:4
+; GFX9-O0-NEXT:    ; kill: killed $vgpr0
 ; GFX9-O0-NEXT:    s_endpgm
 ;
 ; GFX9-O3-LABEL: strict_wwm_call:
@@ -1381,32 +1430,38 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline {
 define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) {
 ; GFX9-O0-LABEL: strict_wwm_call_i64:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_mov_b32 s32, 0
+; GFX9-O0-NEXT:    s_mov_b32 s32, 0x400
 ; GFX9-O0-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; GFX9-O0-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
 ; GFX9-O0-NEXT:    s_mov_b32 s26, -1
 ; GFX9-O0-NEXT:    s_mov_b32 s27, 0xe00000
 ; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr12 : SGPR spill to VGPR lane
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s10
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s10, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s11, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v12, s10, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v12, s11, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s13, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s12, s8
 ; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s4, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s5, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v12, s4, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v12, s5, 3
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v8, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v12, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v12, 1
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v12, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v12, 3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x2c
@@ -1420,20 +1475,24 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    s_mov_b32 s17, s15
 ; GFX9-O0-NEXT:    s_mov_b32 s18, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s8
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s16, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s17, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s18, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s19, 7
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 7
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s9
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s2, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s3, 9
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s2, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 9
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 60
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -1443,11 +1502,11 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    s_addc_u32 s0, s0, s1
 ; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX9-O0-NEXT:    s_mov_b32 s0, 32
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[11:12], s0, v[9:10]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v11
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[10:11], s0, v[8:9]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
 ; GFX9-O0-NEXT:    s_getpc_b64 s[0:1]
 ; GFX9-O0-NEXT:    s_add_u32 s0, s0, strict_wwm_called_i64 at gotpcrel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s1, s1, strict_wwm_called_i64 at gotpcrel32@hi+12
@@ -1465,26 +1524,33 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v8, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v8, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v8, 9
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v2, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v2, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v2, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v2, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v2, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v2, 9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
 ; GFX9-O0-NEXT:    v_add_co_u32_e64 v3, s[6:7], v3, v5
 ; GFX9-O0-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v4, v6, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0
-; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4
-; GFX9-O0-NEXT:    ; kill: killed $vgpr8
+; GFX9-O0-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], s4 offset:4
+; GFX9-O0-NEXT:    ; kill: killed $vgpr0
 ; GFX9-O0-NEXT:    s_endpgm
 ;
 ; GFX9-O3-LABEL: strict_wwm_call_i64:



More information about the llvm-commits mailing list