[llvm] 6ef6c95 - [AMDGPU] Reorder atomic optimizer to avoid CAS loop.

Pravin Jagtap via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 30 09:08:13 PDT 2023


Author: Pravin Jagtap
Date: 2023-08-30T12:05:21-04:00
New Revision: 6ef6c954c6dcd34b99897cc00da9bf1b651398b5

URL: https://github.com/llvm/llvm-project/commit/6ef6c954c6dcd34b99897cc00da9bf1b651398b5
DIFF: https://github.com/llvm/llvm-project/commit/6ef6c954c6dcd34b99897cc00da9bf1b651398b5.diff

LOG: [AMDGPU] Reorder atomic optimizer to avoid CAS loop.

Expand-Atomic pass emits the CAS loop for FP operations
which limits the optimizations offered by atomic optimizer.

Moving atomic optimizer before expand-atomics allows
better codegen.

Reviewed By: arsenm, #amdgpu

Differential Revision: https://reviews.llvm.org/D157265

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
    llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
    llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
    llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
    llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
    llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
    llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 43c9f183f17ba0..2e6b29d986af75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1000,6 +1000,13 @@ void AMDGPUPassConfig::addIRPasses() {
   if (TM.getOptLevel() > CodeGenOpt::None)
     addPass(createInferAddressSpacesPass());
 
+  // Run atomic optimizer before Atomic Expand
+  if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
+      (TM.getOptLevel() >= CodeGenOpt::Less) &&
+      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
+    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
+  }
+
   addPass(createAtomicExpandPass());
 
   if (TM.getOptLevel() > CodeGenOpt::None) {
@@ -1124,11 +1131,6 @@ bool GCNPassConfig::addPreISel() {
   if (TM->getOptLevel() > CodeGenOpt::None)
     addPass(createAMDGPULateCodeGenPreparePass());
 
-  if ((TM->getOptLevel() >= CodeGenOpt::Less) &&
-      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
-    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
-  }
-
   if (TM->getOptLevel() > CodeGenOpt::None)
     addPass(createSinkingPass());
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
index 9954e67a1c32d3..5516741e57e929 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -141,7 +141,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1)
 define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw
   ; GFX90A_GFX940: bb.1 (%ir-block.0):
-  ; GFX90A_GFX940-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  ; GFX90A_GFX940-NEXT:   successors: %bb.2(0x40000000), %bb.5(0x40000000)
   ; GFX90A_GFX940-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
@@ -149,11 +149,11 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A_GFX940-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; GFX90A_GFX940-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX90A_GFX940-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
-  ; GFX90A_GFX940-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A_GFX940-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A_GFX940-NEXT:   S_BRANCH %bb.2
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.5):
-  ; GFX90A_GFX940-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
+  ; GFX90A_GFX940-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT:   [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec
   ; GFX90A_GFX940-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0
@@ -196,29 +196,22 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A_GFX940-NEXT:   [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec
   ; GFX90A_GFX940-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
   ; GFX90A_GFX940-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec
-  ; GFX90A_GFX940-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A_GFX940-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A_GFX940-NEXT:   S_BRANCH %bb.3
   ; GFX90A_GFX940-NEXT: {{  $}}
-  ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.36):
-  ; GFX90A_GFX940-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.35):
+  ; GFX90A_GFX940-NEXT:   successors: %bb.4(0x80000000)
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
-  ; GFX90A_GFX940-NEXT:   S_BRANCH %bb.5
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT: bb.4.Flow:
-  ; GFX90A_GFX940-NEXT:   successors: %bb.6(0x80000000)
-  ; GFX90A_GFX940-NEXT: {{  $}}
-  ; GFX90A_GFX940-NEXT:   SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A_GFX940-NEXT:   S_BRANCH %bb.6
-  ; GFX90A_GFX940-NEXT: {{  $}}
-  ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.38):
-  ; GFX90A_GFX940-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX90A_GFX940-NEXT:   successors: %bb.5(0x80000000)
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT:   SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A_GFX940-NEXT:   S_BRANCH %bb.4
   ; GFX90A_GFX940-NEXT: {{  $}}
-  ; GFX90A_GFX940-NEXT: bb.6 (%ir-block.39):
+  ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.37):
+  ; GFX90A_GFX940-NEXT:   SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index 14c058e3d07dc9..5787392a5901ac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -206,7 +206,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.3
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT: bb.3 (%ir-block.39):
+  ; GFX11-NEXT: bb.3 (%ir-block.36):
   ; GFX11-NEXT:   successors: %bb.5(0x80000000)
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -220,7 +220,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.6
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT: bb.5 (%ir-block.42):
+  ; GFX11-NEXT: bb.5 (%ir-block.39):
   ; GFX11-NEXT:   successors: %bb.4(0x80000000)
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2
@@ -231,7 +231,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.4
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT: bb.6 (%ir-block.50):
+  ; GFX11-NEXT: bb.6 (%ir-block.47):
   ; GFX11-NEXT:   $vgpr0 = COPY [[PHI]]
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
index 6805683d46ee38..3a8041f7a37e12 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
@@ -14,8 +14,7 @@ define amdgpu_ps void @main(i32 %arg) {
 ; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    s_mov_b32 s2, 0
 ; GFX10-NEXT:    s_branch .LBB0_2
-; GFX10-NEXT:  .LBB0_1: ; %Flow
-; GFX10-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; GFX10-NEXT:  .LBB0_1: ; in Loop: Header=BB0_2 Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX10-NEXT:    s_and_b32 s0, exec_lo, vcc_lo

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index dca48a5c3c9587..1ebd864e7e03aa 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -41,6 +41,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX7-NEXT:  .LBB0_4: ; %Flow
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_wqm_b64 s[4:5], -1
+; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
 ; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_vccnz .LBB0_6
 ; GFX7-NEXT:  ; %bb.5: ; %if
@@ -75,6 +76,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX89-NEXT:  .LBB0_4: ; %Flow
 ; GFX89-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX89-NEXT:    s_wqm_b64 s[4:5], -1
+; GFX89-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
 ; GFX89-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GFX89-NEXT:    s_cbranch_vccnz .LBB0_6
 ; GFX89-NEXT:  ; %bb.5: ; %if
@@ -110,6 +112,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1064-NEXT:  .LBB0_4: ; %Flow
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
+; GFX1064-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
 ; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GFX1064-NEXT:    s_cbranch_vccnz .LBB0_6
 ; GFX1064-NEXT:  ; %bb.5: ; %if
@@ -144,6 +147,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1032-NEXT:  .LBB0_4: ; %Flow
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1032-NEXT:    s_wqm_b32 s4, -1
+; GFX1032-NEXT:    s_and_b32 s4, s4, s4
 ; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_cbranch_vccnz .LBB0_6
 ; GFX1032-NEXT:  ; %bb.5: ; %if
@@ -182,7 +186,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1164-NEXT:  .LBB0_4: ; %Flow
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1164-NEXT:    s_wqm_b64 s[4:5], -1
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
 ; GFX1164-NEXT:    s_and_not1_b64 vcc, exec, s[4:5]
 ; GFX1164-NEXT:    s_cbranch_vccnz .LBB0_6
 ; GFX1164-NEXT:  ; %bb.5: ; %if
@@ -222,7 +227,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1132-NEXT:  .LBB0_4: ; %Flow
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1132-NEXT:    s_wqm_b32 s4, -1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_b32 s4, s4, s4
 ; GFX1132-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX1132-NEXT:    s_cbranch_vccnz .LBB0_6
 ; GFX1132-NEXT:  ; %bb.5: ; %if
@@ -307,6 +313,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX8-NEXT:  .LBB1_4: ; %Flow
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
+; GFX8-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_vccnz .LBB1_6
 ; GFX8-NEXT:  ; %bb.5: ; %if
@@ -363,6 +370,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX9-NEXT:  .LBB1_4: ; %Flow
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
 ; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_vccnz .LBB1_6
 ; GFX9-NEXT:  ; %bb.5: ; %if
@@ -426,6 +434,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1064-NEXT:  .LBB1_4: ; %Flow
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
+; GFX1064-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
 ; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GFX1064-NEXT:    s_cbranch_vccnz .LBB1_6
 ; GFX1064-NEXT:  ; %bb.5: ; %if
@@ -480,6 +489,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1032-NEXT:  .LBB1_4: ; %Flow
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1032-NEXT:    s_wqm_b32 s4, -1
+; GFX1032-NEXT:    s_and_b32 s4, s4, s4
 ; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_cbranch_vccnz .LBB1_6
 ; GFX1032-NEXT:  ; %bb.5: ; %if
@@ -552,7 +562,8 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1164-NEXT:  .LBB1_4: ; %Flow
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1164-NEXT:    s_wqm_b64 s[4:5], -1
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
 ; GFX1164-NEXT:    s_and_not1_b64 vcc, exec, s[4:5]
 ; GFX1164-NEXT:    s_cbranch_vccnz .LBB1_6
 ; GFX1164-NEXT:  ; %bb.5: ; %if
@@ -616,7 +627,8 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1132-NEXT:  .LBB1_4: ; %Flow
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1132-NEXT:    s_wqm_b32 s4, -1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_b32 s4, s4, s4
 ; GFX1132-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
 ; GFX1132-NEXT:    s_cbranch_vccnz .LBB1_6
 ; GFX1132-NEXT:  ; %bb.5: ; %if

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index d9d2839635264b..c2b51467edd08d 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -147,7 +147,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1)
 define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
-  ; GFX90A_GFX940-NEXT:   successors: %bb.1(0x40000000), %bb.3(0x40000000)
+  ; GFX90A_GFX940-NEXT:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
   ; GFX90A_GFX940-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -156,11 +156,11 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A_GFX940-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; GFX90A_GFX940-NEXT:   [[COPY3:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX940-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
-  ; GFX90A_GFX940-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX90A_GFX940-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX90A_GFX940-NEXT:   S_BRANCH %bb.1
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT: bb.1 (%ir-block.5):
-  ; GFX90A_GFX940-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  ; GFX90A_GFX940-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT:   [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
   ; GFX90A_GFX940-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
@@ -188,30 +188,23 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A_GFX940-NEXT:   [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_2]]
   ; GFX90A_GFX940-NEXT:   early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
   ; GFX90A_GFX940-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
-  ; GFX90A_GFX940-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX90A_GFX940-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX90A_GFX940-NEXT:   S_BRANCH %bb.2
   ; GFX90A_GFX940-NEXT: {{  $}}
-  ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.36):
-  ; GFX90A_GFX940-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.35):
+  ; GFX90A_GFX940-NEXT:   successors: %bb.3(0x80000000)
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A_GFX940-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY %1
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
-  ; GFX90A_GFX940-NEXT:   S_BRANCH %bb.4
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT: bb.3.Flow:
-  ; GFX90A_GFX940-NEXT:   successors: %bb.5(0x80000000)
-  ; GFX90A_GFX940-NEXT: {{  $}}
-  ; GFX90A_GFX940-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX90A_GFX940-NEXT:   S_BRANCH %bb.5
-  ; GFX90A_GFX940-NEXT: {{  $}}
-  ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.38):
-  ; GFX90A_GFX940-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX90A_GFX940-NEXT:   successors: %bb.4(0x80000000)
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT:   SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX90A_GFX940-NEXT:   S_BRANCH %bb.3
   ; GFX90A_GFX940-NEXT: {{  $}}
-  ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.39):
+  ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.37):
+  ; GFX90A_GFX940-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
index a545c024fb9eee..c559750d4f1b46 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -203,7 +203,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.2
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT: bb.2 (%ir-block.39):
+  ; GFX11-NEXT: bb.2 (%ir-block.36):
   ; GFX11-NEXT:   successors: %bb.4(0x80000000)
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -218,7 +218,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.5
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT: bb.4 (%ir-block.42):
+  ; GFX11-NEXT: bb.4 (%ir-block.39):
   ; GFX11-NEXT:   successors: %bb.3(0x80000000)
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
@@ -228,7 +228,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.3
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT: bb.5 (%ir-block.50):
+  ; GFX11-NEXT: bb.5 (%ir-block.47):
   ; GFX11-NEXT:   $vgpr0 = COPY [[PHI]]
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index debcc67b29a5d5..0499f83de1c92a 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -8,138 +8,216 @@
 define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 {
 ; GFX900-LABEL: global_atomic_fadd_ret_f32:
 ; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_mov_b64 s[4:5], exec
+; GFX900-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX900-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT:    ; implicit-def: $vgpr1
+; GFX900-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX900-NEXT:    s_cbranch_execz .LBB0_4
+; GFX900-NEXT:  ; %bb.1:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX900-NEXT:    s_mov_b64 s[2:3], 0
-; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_bcnt1_i32_b64 s7, s[4:5]
+; GFX900-NEXT:    v_cvt_f32_ubyte0_e32 v1, s7
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
+; GFX900-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    v_mov_b32_e32 v1, s4
-; GFX900-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX900-NEXT:    v_mov_b32_e32 v1, s6
+; GFX900-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT:    v_mov_b32_e32 v2, v1
-; GFX900-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX900-NEXT:    v_mov_b32_e32 v5, v1
+; GFX900-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX900-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_wbinvl1_vol
-; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX900-NEXT:  ; %bb.3: ; %Flow
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:  .LBB0_4: ; %Flow1
 ; GFX900-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX900-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX900-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX908-LABEL: global_atomic_fadd_ret_f32:
 ; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_mov_b64 s[4:5], exec
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT:    ; implicit-def: $vgpr1
+; GFX908-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX908-NEXT:    s_cbranch_execz .LBB0_4
+; GFX908-NEXT:  ; %bb.1:
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX908-NEXT:    s_mov_b64 s[2:3], 0
-; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    s_bcnt1_i32_b64 s7, s[4:5]
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v1, s7
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX908-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v1, s4
-; GFX908-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX908-NEXT:    v_mov_b32_e32 v1, s6
+; GFX908-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT:    v_mov_b32_e32 v2, v1
-; GFX908-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX908-NEXT:    v_mov_b32_e32 v5, v1
+; GFX908-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX908-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1_vol
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX908-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX908-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX908-NEXT:  ; %bb.3: ; %Flow
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:  .LBB0_4: ; %Flow1
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX908-NEXT:    global_store_dword v[0:1], v1, off
+; GFX908-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX908-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GFX908-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX908-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: global_atomic_fadd_ret_f32:
 ; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b64 s[4:5], exec
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT:    ; implicit-def: $vgpr1
+; GFX90A-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX90A-NEXT:    s_cbranch_execz .LBB0_4
+; GFX90A-NEXT:  ; %bb.1:
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_bcnt1_i32_b64 s7, s[4:5]
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v1, s7
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX90A-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
-; GFX90A-NEXT:    v_add_f32_e32 v2, 4.0, v3
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v1
+; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX90A-NEXT:    buffer_wbl2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
+; GFX90A-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX90A-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX90A-NEXT:  ; %bb.3: ; %Flow
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:  .LBB0_4: ; %Flow1
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX90A-NEXT:    global_store_dword v[0:1], v1, off
+; GFX90A-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX90A-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: global_atomic_fadd_ret_f32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s4, exec_lo
+; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10-NEXT:    ; implicit-def: $vgpr1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB0_4
+; GFX10-NEXT:  ; %bb.1:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
+; GFX10-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT:    s_load_dword s5, s[0:1], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    s_mov_b32 s2, 0
-; GFX10-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    buffer_gl1_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX10-NEXT:  ; %bb.3: ; %Flow
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:  .LBB0_4: ; %Flow1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT:    global_store_dword v[0:1], v1, off
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX10-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_atomic_fadd_ret_f32:
 ; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s4, exec_lo
+; GFX11-NEXT:    s_mov_b32 s3, 0
+; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr1
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB0_4
+; GFX11-NEXT:  ; %bb.1:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-NEXT:    s_load_b32 s5, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-NEXT:    s_mov_b32 s2, 0
-; GFX11-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX11-NEXT:    v_dual_mul_f32 v2, 4.0, v1 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX11-NEXT:  ; %bb.3: ; %Flow
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT:  .LBB0_4: ; %Flow1
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX11-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -151,54 +229,88 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0
 define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr) #2 {
 ; GFX900-LABEL: global_atomic_fadd_ret_f32_ieee:
 ; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_mov_b64 s[4:5], exec
+; GFX900-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX900-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT:    ; implicit-def: $vgpr1
+; GFX900-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX900-NEXT:    s_cbranch_execz .LBB1_4
+; GFX900-NEXT:  ; %bb.1:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX900-NEXT:    s_mov_b64 s[2:3], 0
-; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_bcnt1_i32_b64 s7, s[4:5]
+; GFX900-NEXT:    v_cvt_f32_ubyte0_e32 v1, s7
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
+; GFX900-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    v_mov_b32_e32 v1, s4
-; GFX900-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX900-NEXT:    v_mov_b32_e32 v1, s6
+; GFX900-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT:    v_mov_b32_e32 v2, v1
-; GFX900-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX900-NEXT:    v_mov_b32_e32 v5, v1
+; GFX900-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX900-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_wbinvl1_vol
-; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX900-NEXT:  ; %bb.3: ; %Flow
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:  .LBB1_4: ; %Flow1
 ; GFX900-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX900-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX900-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX900-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX908-LABEL: global_atomic_fadd_ret_f32_ieee:
 ; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_mov_b64 s[4:5], exec
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT:    ; implicit-def: $vgpr1
+; GFX908-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX908-NEXT:    s_cbranch_execz .LBB1_4
+; GFX908-NEXT:  ; %bb.1:
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX908-NEXT:    s_mov_b64 s[2:3], 0
-; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    s_bcnt1_i32_b64 s7, s[4:5]
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v1, s7
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX908-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v1, s4
-; GFX908-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX908-NEXT:    v_mov_b32_e32 v1, s6
+; GFX908-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT:    v_mov_b32_e32 v2, v1
-; GFX908-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX908-NEXT:    v_mov_b32_e32 v5, v1
+; GFX908-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX908-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1_vol
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX908-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX908-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX908-NEXT:  ; %bb.3: ; %Flow
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:  .LBB1_4: ; %Flow1
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX908-NEXT:    global_store_dword v[0:1], v1, off
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX908-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX908-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX908-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX908-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX908-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee:
@@ -231,30 +343,46 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr
 ;
 ; GFX10-LABEL: global_atomic_fadd_ret_f32_ieee:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s4, exec_lo
+; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10-NEXT:    ; implicit-def: $vgpr1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB1_4
+; GFX10-NEXT:  ; %bb.1:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
+; GFX10-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT:    s_load_dword s5, s[0:1], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    s_mov_b32 s2, 0
-; GFX10-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    buffer_gl1_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX10-NEXT:  ; %bb.3: ; %Flow
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:  .LBB1_4: ; %Flow1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT:    global_store_dword v[0:1], v1, off
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX10-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX10-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee:
@@ -294,26 +422,36 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr
 define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #0 {
 ; GFX900-LABEL: global_atomic_fadd_noret_f32:
 ; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_mov_b64 s[2:3], exec
+; GFX900-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX900-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT:    s_cbranch_execz .LBB2_3
+; GFX900-NEXT:  ; %bb.1:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT:    s_bcnt1_i32_b64 s5, s[2:3]
+; GFX900-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
 ; GFX900-NEXT:    s_mov_b64 s[2:3], 0
-; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v1, s4
-; GFX900-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX900-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX900-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_wbinvl1_vol
 ; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX900-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX900-NEXT:  .LBB2_3:
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX908-LABEL: global_atomic_fadd_noret_f32:
@@ -360,19 +498,28 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #
 ;
 ; GFX10-LABEL: global_atomic_fadd_noret_f32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s3, exec_lo
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB2_3
+; GFX10-NEXT:  ; %bb.1:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
+; GFX10-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    s_mov_b32 s2, 0
-; GFX10-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX10-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    buffer_gl1_inv
@@ -380,8 +527,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX10-NEXT:  .LBB2_3:
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_atomic_fadd_noret_f32:
@@ -411,26 +558,36 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #
 define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %ptr) #2 {
 ; GFX900-LABEL: global_atomic_fadd_noret_f32_ieee:
 ; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_mov_b64 s[2:3], exec
+; GFX900-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX900-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT:    s_cbranch_execz .LBB3_3
+; GFX900-NEXT:  ; %bb.1:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT:    s_bcnt1_i32_b64 s5, s[2:3]
+; GFX900-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
 ; GFX900-NEXT:    s_mov_b64 s[2:3], 0
-; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v1, s4
-; GFX900-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX900-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX900-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_wbinvl1_vol
 ; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX900-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX900-NEXT:  .LBB3_3:
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX908-LABEL: global_atomic_fadd_noret_f32_ieee:
@@ -477,19 +634,28 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p
 ;
 ; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s3, exec_lo
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB3_3
+; GFX10-NEXT:  ; %bb.1:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
+; GFX10-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    s_mov_b32 s2, 0
-; GFX10-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX10-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    buffer_gl1_inv
@@ -497,8 +663,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX10-NEXT:  .LBB3_3:
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_atomic_fadd_noret_f32_ieee:
@@ -528,54 +694,86 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p
 define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %ptr) #0 {
 ; GFX900-LABEL: global_atomic_fadd_ret_f32_agent:
 ; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_mov_b64 s[4:5], exec
+; GFX900-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX900-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT:    ; implicit-def: $vgpr1
+; GFX900-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX900-NEXT:    s_cbranch_execz .LBB4_4
+; GFX900-NEXT:  ; %bb.1:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX900-NEXT:    s_mov_b64 s[2:3], 0
-; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_bcnt1_i32_b64 s7, s[4:5]
+; GFX900-NEXT:    v_cvt_f32_ubyte0_e32 v1, s7
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
+; GFX900-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    v_mov_b32_e32 v1, s4
-; GFX900-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX900-NEXT:    v_mov_b32_e32 v1, s6
+; GFX900-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT:    v_mov_b32_e32 v2, v1
-; GFX900-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX900-NEXT:    v_mov_b32_e32 v5, v1
+; GFX900-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX900-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_wbinvl1_vol
-; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX900-NEXT:  ; %bb.3: ; %Flow
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:  .LBB4_4: ; %Flow1
 ; GFX900-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX900-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX900-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX908-LABEL: global_atomic_fadd_ret_f32_agent:
 ; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_mov_b64 s[4:5], exec
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT:    ; implicit-def: $vgpr1
+; GFX908-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX908-NEXT:    s_cbranch_execz .LBB4_4
+; GFX908-NEXT:  ; %bb.1:
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX908-NEXT:    s_mov_b64 s[2:3], 0
-; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    s_bcnt1_i32_b64 s7, s[4:5]
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v1, s7
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX908-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v1, s4
-; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX908-NEXT:    v_mov_b32_e32 v1, s6
+; GFX908-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT:    v_mov_b32_e32 v2, v1
-; GFX908-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX908-NEXT:    v_mov_b32_e32 v5, v1
+; GFX908-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX908-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1_vol
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX908-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX908-NEXT:  ; %bb.3: ; %Flow
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:  .LBB4_4: ; %Flow1
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX908-NEXT:    global_store_dword v[0:1], v1, off
+; GFX908-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX908-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GFX908-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX908-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent:
@@ -607,30 +805,45 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt
 ;
 ; GFX10-LABEL: global_atomic_fadd_ret_f32_agent:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s4, exec_lo
+; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10-NEXT:    ; implicit-def: $vgpr1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB4_4
+; GFX10-NEXT:  ; %bb.1:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
+; GFX10-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT:    s_load_dword s5, s[0:1], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    s_mov_b32 s2, 0
-; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    buffer_gl1_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX10-NEXT:  ; %bb.3: ; %Flow
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:  .LBB4_4: ; %Flow1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT:    global_store_dword v[0:1], v1, off
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX10-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_atomic_fadd_ret_f32_agent:
@@ -670,138 +883,216 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt
 define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %ptr) #0 {
 ; GFX900-LABEL: global_atomic_fadd_ret_f32_system:
 ; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_mov_b64 s[4:5], exec
+; GFX900-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX900-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT:    ; implicit-def: $vgpr1
+; GFX900-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX900-NEXT:    s_cbranch_execz .LBB5_4
+; GFX900-NEXT:  ; %bb.1:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX900-NEXT:    s_mov_b64 s[2:3], 0
-; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_bcnt1_i32_b64 s7, s[4:5]
+; GFX900-NEXT:    v_cvt_f32_ubyte0_e32 v1, s7
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
+; GFX900-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    v_mov_b32_e32 v1, s4
-; GFX900-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX900-NEXT:    v_mov_b32_e32 v1, s6
+; GFX900-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT:    v_mov_b32_e32 v2, v1
-; GFX900-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX900-NEXT:    v_mov_b32_e32 v5, v1
+; GFX900-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX900-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_wbinvl1_vol
-; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX900-NEXT:  ; %bb.3: ; %Flow
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:  .LBB5_4: ; %Flow1
 ; GFX900-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX900-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX900-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX908-LABEL: global_atomic_fadd_ret_f32_system:
 ; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_mov_b64 s[4:5], exec
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT:    ; implicit-def: $vgpr1
+; GFX908-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX908-NEXT:    s_cbranch_execz .LBB5_4
+; GFX908-NEXT:  ; %bb.1:
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX908-NEXT:    s_mov_b64 s[2:3], 0
-; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    s_bcnt1_i32_b64 s7, s[4:5]
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v1, s7
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX908-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v1, s4
-; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX908-NEXT:    v_mov_b32_e32 v1, s6
+; GFX908-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT:    v_mov_b32_e32 v2, v1
-; GFX908-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX908-NEXT:    v_mov_b32_e32 v5, v1
+; GFX908-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX908-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1_vol
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX908-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX908-NEXT:  ; %bb.3: ; %Flow
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:  .LBB5_4: ; %Flow1
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX908-NEXT:    global_store_dword v[0:1], v1, off
+; GFX908-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX908-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GFX908-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX908-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: global_atomic_fadd_ret_f32_system:
 ; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b64 s[4:5], exec
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT:    ; implicit-def: $vgpr1
+; GFX90A-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX90A-NEXT:    s_cbranch_execz .LBB5_4
+; GFX90A-NEXT:  ; %bb.1:
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_bcnt1_i32_b64 s7, s[4:5]
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v1, s7
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX90A-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
-; GFX90A-NEXT:    v_add_f32_e32 v2, 4.0, v3
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v1
+; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX90A-NEXT:    buffer_wbl2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
+; GFX90A-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX90A-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX90A-NEXT:  ; %bb.3: ; %Flow
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:  .LBB5_4: ; %Flow1
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX90A-NEXT:    global_store_dword v[0:1], v1, off
+; GFX90A-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX90A-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: global_atomic_fadd_ret_f32_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s4, exec_lo
+; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10-NEXT:    ; implicit-def: $vgpr1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB5_4
+; GFX10-NEXT:  ; %bb.1:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
+; GFX10-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT:    s_load_dword s5, s[0:1], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    s_mov_b32 s2, 0
-; GFX10-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    buffer_gl1_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX10-NEXT:  ; %bb.3: ; %Flow
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:  .LBB5_4: ; %Flow1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT:    global_store_dword v[0:1], v1, off
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX10-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_atomic_fadd_ret_f32_system:
 ; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s4, exec_lo
+; GFX11-NEXT:    s_mov_b32 s3, 0
+; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr1
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB5_4
+; GFX11-NEXT:  ; %bb.1:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-NEXT:    s_load_b32 s5, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-NEXT:    s_mov_b32 s2, 0
-; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX11-NEXT:    v_dual_mul_f32 v2, 4.0, v1 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX11-NEXT:  ; %bb.3: ; %Flow
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT:  .LBB5_4: ; %Flow1
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX11-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -813,54 +1104,86 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p
 define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 {
 ; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b64 s[4:5], exec
+; GCN-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    ; implicit-def: $vgpr1
+; GCN-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GCN-NEXT:    s_cbranch_execz .LBB6_4
+; GCN-NEXT:  ; %bb.1:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b64 s[2:3], 0
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_bcnt1_i32_b64 s7, s[4:5]
+; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GCN-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GCN-NEXT:    v_mov_b32_e32 v5, v1
+; GCN-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GCN-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_wbinvl1_vol
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GCN-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GCN-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GCN-NEXT:    s_cbranch_execnz .LBB6_1
-; GCN-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GCN-NEXT:    s_cbranch_execnz .LBB6_2
+; GCN-NEXT:  ; %bb.3: ; %Flow
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:  .LBB6_4: ; %Flow1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GCN-NEXT:    global_store_dword v[0:1], v1, off
+; GCN-NEXT:    v_readfirstlane_b32 s0, v1
+; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GCN-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GCN-NEXT:    global_store_dword v[0:1], v0, off
 ; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
 ; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b64 s[4:5], exec
+; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX11-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX11-NEXT:    ; implicit-def: $vgpr1
+; GFX11-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX11-NEXT:    s_cbranch_execz .LBB6_4
+; GFX11-NEXT:  ; %bb.1:
 ; GFX11-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b64 s[2:3], 0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_bcnt1_i32_b64 s7, s[4:5]
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, s7
+; GFX11-NEXT:    s_mov_b64 s[4:5], 0
+; GFX11-NEXT:    v_mul_f32_e32 v2, 4.0, v1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX11-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s4
-; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
+; GFX11-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_add_f32_e32 v4, v5, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX11-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_wbinvl1_vol
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX11-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX11-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX11-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX11-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX11-NEXT:    s_cbranch_execnz .LBB6_2
+; GFX11-NEXT:  ; %bb.3: ; %Flow
+; GFX11-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX11-NEXT:  .LBB6_4: ; %Flow1
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX11-NEXT:    global_store_dword v[0:1], v1, off
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GFX11-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   store float %result, ptr addrspace(1) undef
@@ -916,91 +1239,130 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr
 define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %ptr) {
 ; GFX900-LABEL: global_atomic_fadd_noret_f32_safe:
 ; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_mov_b64 s[2:3], exec
+; GFX900-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX900-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT:    s_cbranch_execz .LBB8_3
+; GFX900-NEXT:  ; %bb.1:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT:    s_bcnt1_i32_b64 s5, s[2:3]
+; GFX900-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
 ; GFX900-NEXT:    s_mov_b64 s[2:3], 0
-; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v1, s4
-; GFX900-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX900-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX900-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_wbinvl1_vol
 ; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX900-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX900-NEXT:  .LBB8_3:
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX908-LABEL: global_atomic_fadd_noret_f32_safe:
 ; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_mov_b64 s[2:3], exec
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT:    s_cbranch_execz .LBB8_3
+; GFX908-NEXT:  ; %bb.1:
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT:    s_bcnt1_i32_b64 s5, s[2:3]
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
 ; GFX908-NEXT:    s_mov_b64 s[2:3], 0
-; GFX908-NEXT:    v_mov_b32_e32 v2, 0
+; GFX908-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, s4
-; GFX908-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX908-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX908-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1_vol
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX908-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX908-NEXT:  .LBB8_3:
 ; GFX908-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: global_atomic_fadd_noret_f32_safe:
 ; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b64 s[2:3], exec
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT:    s_cbranch_execz .LBB8_3
+; GFX90A-NEXT:  ; %bb.1:
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    s_bcnt1_i32_b64 s5, s[2:3]
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
 ; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX90A-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX90A-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX90A-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX90A-NEXT:  .LBB8_3:
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: global_atomic_fadd_noret_f32_safe:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s3, exec_lo
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB8_3
+; GFX10-NEXT:  ; %bb.1:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
+; GFX10-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    s_mov_b32 s2, 0
-; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX10-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    buffer_gl1_inv
@@ -1008,25 +1370,33 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX10-NEXT:  .LBB8_3:
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_atomic_fadd_noret_f32_safe:
 ; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-NEXT:    s_mov_b32 s2, 0
+; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX11-NEXT:    s_mov_b32 s4, exec_lo
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB8_3
+; GFX11-NEXT:  ; %bb.1:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s2
-; GFX11-NEXT:    s_mov_b32 s2, 0
-; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -1034,8 +1404,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX11-NEXT:  .LBB8_3:
 ; GFX11-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   ret void
@@ -1044,26 +1414,35 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p
 define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 {
 ; GFX900-LABEL: infer_as_before_atomic:
 ; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_mov_b64 s[2:3], exec
+; GFX900-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX900-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT:    s_cbranch_execz .LBB9_3
+; GFX900-NEXT:  ; %bb.1:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT:    s_bcnt1_i32_b64 s5, s[2:3]
 ; GFX900-NEXT:    s_mov_b64 s[2:3], 0
-; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_cvt_f32_ubyte0_e32 v2, s5
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v1, s4
-; GFX900-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX900-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT:    v_add_f32_e32 v0, 1.0, v1
-; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX900-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX900-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_cbranch_execnz .LBB9_2
+; GFX900-NEXT:  .LBB9_3:
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX908-LABEL: infer_as_before_atomic:
@@ -1108,26 +1487,34 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 {
 ;
 ; GFX10-LABEL: infer_as_before_atomic:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s3, exec_lo
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB9_3
+; GFX10-NEXT:  ; %bb.1:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, s3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    s_mov_b32 s2, 0
-; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-NEXT:  .LBB9_2: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_add_f32_e32 v0, 1.0, v1
-; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX10-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_cbranch_execnz .LBB9_2
+; GFX10-NEXT:  .LBB9_3:
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: infer_as_before_atomic:

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 7ba4fb48d16f99..429bdd805ec5e1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -17,94 +17,133 @@ declare float @div.float.value()
 define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_3
+; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX7LESS-NEXT:  .LBB0_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB0_3
+; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_bcnt1_i32_b64 s5, s[2:3]
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX9-NEXT:  .LBB0_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT:    s_cbranch_execz .LBB0_3
+; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1064-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX1064-NEXT:  .LBB0_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_cbranch_execz .LBB0_3
+; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    s_mov_b32 s2, 0
-; GFX1032-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1032-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX1032-NEXT:  .LBB0_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
@@ -153,68 +192,97 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ;
 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB0_3
+; GFX9-DPP-NEXT:  ; %bb.1:
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s5, s[2:3]
+; GFX9-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX9-DPP-NEXT:  .LBB0_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB0_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX1064-DPP-NEXT:  .LBB0_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB0_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX1032-DPP-NEXT:  .LBB0_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
@@ -324,11 +392,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -343,23 +411,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX9-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX9-NEXT:    s_add_i32 s2, s2, 32
+; GFX9-NEXT:    s_min_u32 s2, s3, s2
+; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execz .LBB1_5
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT:  .LBB1_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB1_4
+; GFX9-NEXT:  .LBB1_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -369,43 +461,67 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-NEXT:    s_mov_b32 s12, s8
-; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_mov_b32 s13, s9
-; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT:    s_add_i32 s2, s2, 32
+; GFX1064-NEXT:    s_min_u32 s2, s3, s2
+; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB1_5
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT:  .LBB1_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB1_4
+; GFX1064-NEXT:  .LBB1_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -415,43 +531,63 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-NEXT:    s_mov_b32 s12, s8
-; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_mov_b32 s13, s9
-; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s0, 0
-; GFX1032-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execz .LBB1_5
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT:  .LBB1_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB1_4
+; GFX1032-NEXT:  .LBB1_5:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -566,11 +702,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -585,23 +721,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    s_nop 0
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB1_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX9-DPP-NEXT:  .LBB1_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -611,43 +786,80 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB1_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX1064-DPP-NEXT:  .LBB1_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -657,43 +869,74 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB1_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX1032-DPP-NEXT:  .LBB1_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -839,256 +1082,466 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s10, -1
+; GFX7LESS-NEXT:    s_mov_b32 s11, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s8, s8, s3
+; GFX7LESS-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT:    s_cbranch_execz .LBB2_3
+; GFX7LESS-NEXT:  ; %bb.1:
+; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX7LESS-NEXT:  .LBB2_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9-NEXT:    s_add_u32 s8, s8, s3
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB2_3
+; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v3
+; GFX9-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX9-NEXT:  .LBB2_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s10, -1
+; GFX1064-NEXT:    s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s8, s8, s3
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1064-NEXT:  ; %bb.1:
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1064-NEXT:  .LBB2_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s10, -1
+; GFX1032-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s8, s8, s3
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    s_mov_b32 s2, 0
-; GFX1032-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1032-NEXT:  .LBB2_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b32 s2, 0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_clause 0x1
+; GFX1164-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1164-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1164-NEXT:  ; %bb.1:
+; GFX1164-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1164-NEXT:  .LBB2_3:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_clause 0x1
+; GFX1132-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1132-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:    s_mov_b32 s2, 0
-; GFX1132-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1132-NEXT:  .LBB2_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v3
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX9-DPP-NEXT:  .LBB2_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-DPP-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1064-DPP-NEXT:  .LBB2_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1032-DPP-NEXT:  .LBB2_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT:    s_clause 0x1
+; GFX1164-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1164-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
+; GFX1164-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1164-DPP-NEXT:  .LBB2_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT:    s_clause 0x1
+; GFX1132-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1132-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
+; GFX1132-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1132-DPP-NEXT:  .LBB2_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
   ret void
@@ -1155,11 +1608,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -1174,23 +1627,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX9-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX9-NEXT:    s_add_i32 s2, s2, 32
+; GFX9-NEXT:    s_min_u32 s2, s3, s2
+; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execz .LBB3_5
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT:  .LBB3_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX9-NEXT:  .LBB3_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1200,43 +1677,67 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-NEXT:    s_mov_b32 s12, s8
-; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_mov_b32 s13, s9
-; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT:    s_add_i32 s2, s2, 32
+; GFX1064-NEXT:    s_min_u32 s2, s3, s2
+; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT:  .LBB3_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX1064-NEXT:  .LBB3_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1246,94 +1747,142 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-NEXT:    s_mov_b32 s12, s8
-; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_mov_b32 s13, s9
-; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s0, 0
-; GFX1032-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT:  .LBB3_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX1032-NEXT:  .LBB3_5:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-NEXT:    s_mov_b32 s12, s8
-; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
-; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT:    s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT:    s_add_i32 s2, s2, 32
+; GFX1164-NEXT:    s_min_u32 s2, s3, s2
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT:  .LBB3_4: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX1164-NEXT:  .LBB3_5:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
@@ -1341,21 +1890,46 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s0, 0
-; GFX1132-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1132-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT:  .LBB3_4: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX1132-NEXT:  .LBB3_5:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1366,11 +1940,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -1385,23 +1959,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    s_nop 0
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX9-DPP-NEXT:  .LBB3_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1411,140 +2024,254 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX1064-DPP-NEXT:    s_endpgm
-;
-; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX1064-DPP-NEXT:  .LBB3_3:
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX1032-DPP-NEXT:  .LBB3_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v4, v5, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX1164-DPP-NEXT:  .LBB3_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
@@ -1552,21 +2279,60 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v4, v5, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX1132-DPP-NEXT:  .LBB3_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
@@ -1576,256 +2342,466 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{
 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s10, -1
+; GFX7LESS-NEXT:    s_mov_b32 s11, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s8, s8, s3
+; GFX7LESS-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_3
+; GFX7LESS-NEXT:  ; %bb.1:
+; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX7LESS-NEXT:  .LBB4_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9-NEXT:    s_add_u32 s8, s8, s3
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB4_3
+; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v3
+; GFX9-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX9-NEXT:  .LBB4_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s10, -1
+; GFX1064-NEXT:    s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s8, s8, s3
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1064-NEXT:  ; %bb.1:
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1064-NEXT:  .LBB4_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s10, -1
+; GFX1032-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s8, s8, s3
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    s_mov_b32 s2, 0
-; GFX1032-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1032-NEXT:  .LBB4_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b32 s2, 0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_clause 0x1
+; GFX1164-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1164-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1164-NEXT:  ; %bb.1:
+; GFX1164-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1164-NEXT:  .LBB4_3:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_clause 0x1
+; GFX1132-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1132-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:    s_mov_b32 s2, 0
-; GFX1132-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1132-NEXT:  .LBB4_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v3
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX9-DPP-NEXT:  .LBB4_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-DPP-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1064-DPP-NEXT:  .LBB4_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1032-DPP-NEXT:  .LBB4_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT:    s_clause 0x1
+; GFX1164-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1164-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
+; GFX1164-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1164-DPP-NEXT:  .LBB4_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT:    s_clause 0x1
+; GFX1132-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1132-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
+; GFX1132-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1132-DPP-NEXT:  .LBB4_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic
   ret void
@@ -1892,11 +2868,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -1911,23 +2887,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX9-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX9-NEXT:    s_add_i32 s2, s2, 32
+; GFX9-NEXT:    s_min_u32 s2, s3, s2
+; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execz .LBB5_5
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT:  .LBB5_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX9-NEXT:  .LBB5_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -1937,43 +2937,67 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-NEXT:    s_mov_b32 s12, s8
-; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_mov_b32 s13, s9
-; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT:    s_add_i32 s2, s2, 32
+; GFX1064-NEXT:    s_min_u32 s2, s3, s2
+; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT:  .LBB5_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX1064-NEXT:  .LBB5_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -1983,43 +3007,63 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-NEXT:    s_mov_b32 s12, s8
-; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_mov_b32 s13, s9
-; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s0, 0
-; GFX1032-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT:  .LBB5_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX1032-NEXT:  .LBB5_5:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -2134,11 +3178,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -2153,23 +3197,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    s_nop 0
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX9-DPP-NEXT:  .LBB5_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -2179,43 +3262,80 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX1064-DPP-NEXT:  .LBB5_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -2225,43 +3345,74 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX1032-DPP-NEXT:  .LBB5_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -2465,11 +3616,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -2484,23 +3635,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX9-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX9-NEXT:    s_add_i32 s2, s2, 32
+; GFX9-NEXT:    s_min_u32 s2, s3, s2
+; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execz .LBB6_5
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT:  .LBB6_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB6_4
+; GFX9-NEXT:  .LBB6_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2510,43 +3685,67 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-NEXT:    s_mov_b32 s12, s8
-; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_mov_b32 s13, s9
-; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT:    s_add_i32 s2, s2, 32
+; GFX1064-NEXT:    s_min_u32 s2, s3, s2
+; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB6_5
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT:  .LBB6_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB6_4
+; GFX1064-NEXT:  .LBB6_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2556,43 +3755,63 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-NEXT:    s_mov_b32 s12, s8
-; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_mov_b32 s13, s9
-; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s0, 0
-; GFX1032-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execz .LBB6_5
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT:  .LBB6_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB6_4
+; GFX1032-NEXT:  .LBB6_5:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2707,11 +3926,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -2726,23 +3945,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    s_nop 0
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB6_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_2
+; GFX9-DPP-NEXT:  .LBB6_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2752,43 +4010,80 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB6_2
+; GFX1064-DPP-NEXT:  .LBB6_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2798,43 +4093,74 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB6_2
+; GFX1032-DPP-NEXT:  .LBB6_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2980,256 +4306,466 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 {
 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s10, -1
+; GFX7LESS-NEXT:    s_mov_b32 s11, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s8, s8, s3
+; GFX7LESS-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_3
+; GFX7LESS-NEXT:  ; %bb.1:
+; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX7LESS-NEXT:  .LBB7_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9-NEXT:    s_add_u32 s8, s8, s3
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB7_3
+; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v3
+; GFX9-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX9-NEXT:  .LBB7_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s10, -1
+; GFX1064-NEXT:    s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s8, s8, s3
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1064-NEXT:  ; %bb.1:
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1064-NEXT:  .LBB7_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s10, -1
+; GFX1032-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s8, s8, s3
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    s_mov_b32 s2, 0
-; GFX1032-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1032-NEXT:  .LBB7_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b32 s2, 0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_clause 0x1
+; GFX1164-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1164-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1164-NEXT:  ; %bb.1:
+; GFX1164-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1164-NEXT:  .LBB7_3:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_clause 0x1
+; GFX1132-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1132-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:    s_mov_b32 s2, 0
-; GFX1132-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1132-NEXT:  .LBB7_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v3
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX9-DPP-NEXT:  .LBB7_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-DPP-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1064-DPP-NEXT:  .LBB7_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1032-DPP-NEXT:  .LBB7_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT:    s_clause 0x1
+; GFX1164-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1164-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
+; GFX1164-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1164-DPP-NEXT:  .LBB7_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT:    s_clause 0x1
+; GFX1132-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1132-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
+; GFX1132-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1132-DPP-NEXT:  .LBB7_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
   ret void
@@ -3295,11 +4831,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -3314,23 +4850,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX9-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX9-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX9-NEXT:    s_add_i32 s2, s2, 32
+; GFX9-NEXT:    s_min_u32 s2, s3, s2
+; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execz .LBB8_5
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT:  .LBB8_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB8_4
+; GFX9-NEXT:  .LBB8_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
@@ -3340,43 +4900,67 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-NEXT:    s_mov_b32 s12, s8
-; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_mov_b32 s13, s9
-; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT:    s_add_i32 s2, s2, 32
+; GFX1064-NEXT:    s_min_u32 s2, s3, s2
+; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB8_5
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT:  .LBB8_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB8_4
+; GFX1064-NEXT:  .LBB8_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
@@ -3386,94 +4970,142 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-NEXT:    s_mov_b32 s12, s8
-; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_mov_b32 s13, s9
-; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s0, 0
-; GFX1032-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execz .LBB8_5
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT:  .LBB8_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB8_4
+; GFX1032-NEXT:  .LBB8_5:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-NEXT:    s_mov_b32 s12, s8
-; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
-; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT:    s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT:    s_add_i32 s2, s2, 32
+; GFX1164-NEXT:    s_min_u32 s2, s3, s2
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execz .LBB8_5
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT:  .LBB8_4: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB8_4
+; GFX1164-NEXT:  .LBB8_5:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
@@ -3481,21 +5113,46 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s0, 0
-; GFX1132-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1132-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execz .LBB8_5
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT:  .LBB8_4: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB8_4
+; GFX1132-NEXT:  .LBB8_5:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
@@ -3506,11 +5163,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -3525,23 +5182,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT:    s_nop 0
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX9-DPP-NEXT:  .LBB8_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
@@ -3551,43 +5247,80 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX1064-DPP-NEXT:  .LBB8_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
@@ -3597,94 +5330,171 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX1032-DPP-NEXT:  .LBB8_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v4, v5, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX1164-DPP-NEXT:  .LBB8_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
@@ -3692,21 +5502,60 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v4, v5, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX1132-DPP-NEXT:  .LBB8_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue monotonic, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index ffae7980a9d498..55a3b877181fe9 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -17,256 +17,368 @@ declare float @div.float.value()
 define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
 ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_3
+; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_add_f32_e32 v0, -4.0, v1
+; GFX7LESS-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX7LESS-NEXT:  .LBB0_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB0_3
+; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_bcnt1_i32_b64 s5, s[2:3]
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_add_f32_e32 v0, -4.0, v1
-; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX9-NEXT:  .LBB0_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT:    s_cbranch_execz .LBB0_3
+; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1064-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_add_f32_e32 v0, -4.0, v1
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX1064-NEXT:  .LBB0_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_cbranch_execz .LBB0_3
+; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    s_mov_b32 s2, 0
-; GFX1032-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1032-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_add_f32_e32 v0, -4.0, v1
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX1032-NEXT:  .LBB0_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT:    s_cbranch_execz .LBB0_3
+; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_load_b32 s4, s[0:1], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1164-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_f32_e32 v0, -4.0, v1
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX1164-NEXT:  .LBB0_3:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT:    s_cbranch_execz .LBB0_3
+; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_load_b32 s4, s[0:1], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:    s_mov_b32 s2, 0
-; GFX1132-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_f32_e32 v0, -4.0, v1
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX1132-NEXT:  .LBB0_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB0_3
+; GFX9-DPP-NEXT:  ; %bb.1:
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s5, s[2:3]
+; GFX9-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_add_f32_e32 v0, -4.0, v1
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX9-DPP-NEXT:  .LBB0_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB0_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, -4.0, v1
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX1064-DPP-NEXT:  .LBB0_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB0_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, -4.0, v1
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX1032-DPP-NEXT:  .LBB0_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB0_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_load_b32 s4, s[0:1], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1164-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, -4.0, v1
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX1164-DPP-NEXT:  .LBB0_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB0_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_load_b32 s4, s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, -4.0, v1
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX1132-DPP-NEXT:  .LBB0_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
   ret void
@@ -332,11 +444,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -351,23 +463,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX9-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX9-NEXT:    s_add_i32 s2, s2, 32
+; GFX9-NEXT:    s_min_u32 s2, s3, s2
+; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX9-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execz .LBB1_5
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT:  .LBB1_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB1_4
+; GFX9-NEXT:  .LBB1_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
@@ -377,43 +513,67 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-NEXT:    s_mov_b32 s12, s8
-; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_mov_b32 s13, s9
-; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT:    s_add_i32 s2, s2, 32
+; GFX1064-NEXT:    s_min_u32 s2, s3, s2
+; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB1_5
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT:  .LBB1_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB1_4
+; GFX1064-NEXT:  .LBB1_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
@@ -423,94 +583,142 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-NEXT:    s_mov_b32 s12, s8
-; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_mov_b32 s13, s9
-; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s0, 0
-; GFX1032-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_subrev_f32_e32 v2, s2, v2
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execz .LBB1_5
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT:  .LBB1_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB1_4
+; GFX1032-NEXT:  .LBB1_5:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-NEXT:    s_mov_b32 s12, s8
-; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
-; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT:    s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT:    s_add_i32 s2, s2, 32
+; GFX1164-NEXT:    s_min_u32 s2, s3, s2
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execz .LBB1_5
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT:  .LBB1_4: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB1_4
+; GFX1164-NEXT:  .LBB1_5:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
@@ -518,21 +726,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s0, 0
-; GFX1132-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_subrev_f32_e32 v2, s2, v2
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execz .LBB1_5
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT:  .LBB1_4: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB1_4
+; GFX1132-NEXT:  .LBB1_5:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
@@ -543,11 +776,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -562,23 +795,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT:    s_nop 0
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB1_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    v_subrev_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX9-DPP-NEXT:  .LBB1_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
@@ -588,43 +851,70 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_sub_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB1_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v5
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX1064-DPP-NEXT:  .LBB1_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
@@ -634,94 +924,148 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB1_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v5
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX1032-DPP-NEXT:  .LBB1_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB1_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v3, v4, v5
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX1164-DPP-NEXT:  .LBB1_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
@@ -729,21 +1073,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v3, v4, v5
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB1_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX1132-DPP-NEXT:  .LBB1_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
@@ -753,256 +1123,466 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
 ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s10, -1
+; GFX7LESS-NEXT:    s_mov_b32 s11, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s8, s8, s3
+; GFX7LESS-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT:    s_cbranch_execz .LBB2_3
+; GFX7LESS-NEXT:  ; %bb.1:
+; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX7LESS-NEXT:  .LBB2_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9-NEXT:    s_add_u32 s8, s8, s3
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB2_3
+; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v3
+; GFX9-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX9-NEXT:  .LBB2_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s10, -1
+; GFX1064-NEXT:    s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s8, s8, s3
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1064-NEXT:  ; %bb.1:
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1064-NEXT:  .LBB2_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s10, -1
+; GFX1032-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s8, s8, s3
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    s_mov_b32 s2, 0
-; GFX1032-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1032-NEXT:  .LBB2_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b32 s2, 0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_clause 0x1
+; GFX1164-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1164-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1164-NEXT:  ; %bb.1:
+; GFX1164-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1164-NEXT:  .LBB2_3:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_clause 0x1
+; GFX1132-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1132-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:    s_mov_b32 s2, 0
-; GFX1132-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1132-NEXT:  .LBB2_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v3
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX9-DPP-NEXT:  .LBB2_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-DPP-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1064-DPP-NEXT:  .LBB2_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1032-DPP-NEXT:  .LBB2_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT:    s_clause 0x1
+; GFX1164-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1164-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
+; GFX1164-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1164-DPP-NEXT:  .LBB2_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT:    s_clause 0x1
+; GFX1132-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1132-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
+; GFX1132-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX1132-DPP-NEXT:  .LBB2_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
   ret void
@@ -1069,11 +1649,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -1088,23 +1668,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX9-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX9-NEXT:    s_add_i32 s2, s2, 32
+; GFX9-NEXT:    s_min_u32 s2, s3, s2
+; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX9-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execz .LBB3_5
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT:  .LBB3_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX9-NEXT:  .LBB3_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1114,43 +1718,67 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-NEXT:    s_mov_b32 s12, s8
-; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_mov_b32 s13, s9
-; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT:    s_add_i32 s2, s2, 32
+; GFX1064-NEXT:    s_min_u32 s2, s3, s2
+; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT:  .LBB3_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX1064-NEXT:  .LBB3_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1160,94 +1788,142 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-NEXT:    s_mov_b32 s12, s8
-; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_mov_b32 s13, s9
-; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s0, 0
-; GFX1032-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_subrev_f32_e32 v2, s2, v2
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT:  .LBB3_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX1032-NEXT:  .LBB3_5:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-NEXT:    s_mov_b32 s12, s8
-; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
-; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT:    s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT:    s_add_i32 s2, s2, 32
+; GFX1164-NEXT:    s_min_u32 s2, s3, s2
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT:  .LBB3_4: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX1164-NEXT:  .LBB3_5:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
@@ -1255,21 +1931,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s0, 0
-; GFX1132-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_subrev_f32_e32 v2, s2, v2
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT:  .LBB3_4: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX1132-NEXT:  .LBB3_5:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1280,11 +1981,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -1299,23 +2000,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT:    s_nop 0
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    v_subrev_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX9-DPP-NEXT:  .LBB3_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1325,43 +2056,70 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_sub_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v5
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX1064-DPP-NEXT:  .LBB3_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1371,94 +2129,148 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v5
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX1032-DPP-NEXT:  .LBB3_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v3, v4, v5
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX1164-DPP-NEXT:  .LBB3_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
@@ -1466,21 +2278,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v3, v4, v5
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX1132-DPP-NEXT:  .LBB3_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
@@ -1490,256 +2328,466 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{
 ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s10, -1
+; GFX7LESS-NEXT:    s_mov_b32 s11, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s8, s8, s3
+; GFX7LESS-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_3
+; GFX7LESS-NEXT:  ; %bb.1:
+; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX7LESS-NEXT:  .LBB4_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9-NEXT:    s_add_u32 s8, s8, s3
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB4_3
+; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v3
+; GFX9-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX9-NEXT:  .LBB4_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s10, -1
+; GFX1064-NEXT:    s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s8, s8, s3
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1064-NEXT:  ; %bb.1:
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1064-NEXT:  .LBB4_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s10, -1
+; GFX1032-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s8, s8, s3
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    s_mov_b32 s2, 0
-; GFX1032-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1032-NEXT:  .LBB4_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b32 s2, 0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_clause 0x1
+; GFX1164-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1164-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1164-NEXT:  ; %bb.1:
+; GFX1164-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1164-NEXT:  .LBB4_3:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_clause 0x1
+; GFX1132-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1132-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:    s_mov_b32 s2, 0
-; GFX1132-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1132-NEXT:  .LBB4_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v3
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX9-DPP-NEXT:  .LBB4_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-DPP-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1064-DPP-NEXT:  .LBB4_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1032-DPP-NEXT:  .LBB4_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT:    s_clause 0x1
+; GFX1164-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1164-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
+; GFX1164-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1164-DPP-NEXT:  .LBB4_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT:    s_clause 0x1
+; GFX1132-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1132-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
+; GFX1132-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX1132-DPP-NEXT:  .LBB4_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic
   ret void
@@ -1806,11 +2854,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -1825,23 +2873,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX9-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX9-NEXT:    s_add_i32 s2, s2, 32
+; GFX9-NEXT:    s_min_u32 s2, s3, s2
+; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX9-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execz .LBB5_5
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT:  .LBB5_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX9-NEXT:  .LBB5_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
@@ -1851,43 +2923,67 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-NEXT:    s_mov_b32 s12, s8
-; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_mov_b32 s13, s9
-; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT:    s_add_i32 s2, s2, 32
+; GFX1064-NEXT:    s_min_u32 s2, s3, s2
+; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT:  .LBB5_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX1064-NEXT:  .LBB5_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
@@ -1897,94 +2993,142 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-NEXT:    s_mov_b32 s12, s8
-; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_mov_b32 s13, s9
-; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s0, 0
-; GFX1032-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_subrev_f32_e32 v2, s2, v2
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT:  .LBB5_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX1032-NEXT:  .LBB5_5:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-NEXT:    s_mov_b32 s12, s8
-; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
-; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT:    s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT:    s_add_i32 s2, s2, 32
+; GFX1164-NEXT:    s_min_u32 s2, s3, s2
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT:  .LBB5_4: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX1164-NEXT:  .LBB5_5:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
@@ -1992,21 +3136,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s0, 0
-; GFX1132-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_subrev_f32_e32 v2, s2, v2
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT:  .LBB5_4: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX1132-NEXT:  .LBB5_5:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
@@ -2017,11 +3186,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -2036,23 +3205,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT:    s_nop 0
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    v_subrev_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX9-DPP-NEXT:  .LBB5_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
@@ -2062,43 +3261,70 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_sub_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v5
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX1064-DPP-NEXT:  .LBB5_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
@@ -2108,94 +3334,148 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v5
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX1032-DPP-NEXT:  .LBB5_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v3, v4, v5
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX1164-DPP-NEXT:  .LBB5_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
@@ -2203,21 +3483,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v3, v4, v5
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX1132-DPP-NEXT:  .LBB5_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
@@ -2285,11 +3591,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -2304,23 +3610,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX9-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX9-NEXT:    s_add_i32 s2, s2, 32
+; GFX9-NEXT:    s_min_u32 s2, s3, s2
+; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX9-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execz .LBB6_5
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT:  .LBB6_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB6_4
+; GFX9-NEXT:  .LBB6_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2330,43 +3660,67 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-NEXT:    s_mov_b32 s12, s8
-; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_mov_b32 s13, s9
-; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT:    s_add_i32 s2, s2, 32
+; GFX1064-NEXT:    s_min_u32 s2, s3, s2
+; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB6_5
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT:  .LBB6_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB6_4
+; GFX1064-NEXT:  .LBB6_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2376,94 +3730,142 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-NEXT:    s_mov_b32 s12, s8
-; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_mov_b32 s13, s9
-; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s0, 0
-; GFX1032-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_subrev_f32_e32 v2, s2, v2
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execz .LBB6_5
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT:  .LBB6_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB6_4
+; GFX1032-NEXT:  .LBB6_5:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-NEXT:    s_mov_b32 s12, s8
-; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
-; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT:    s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT:    s_add_i32 s2, s2, 32
+; GFX1164-NEXT:    s_min_u32 s2, s3, s2
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execz .LBB6_5
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT:  .LBB6_4: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB6_4
+; GFX1164-NEXT:  .LBB6_5:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
@@ -2471,21 +3873,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s0, 0
-; GFX1132-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_subrev_f32_e32 v2, s2, v2
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execz .LBB6_5
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT:  .LBB6_4: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB6_4
+; GFX1132-NEXT:  .LBB6_5:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2496,11 +3923,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -2515,23 +3942,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT:    s_nop 0
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB6_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    v_subrev_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_2
+; GFX9-DPP-NEXT:  .LBB6_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2541,43 +3998,70 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_sub_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v5
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB6_2
+; GFX1064-DPP-NEXT:  .LBB6_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2587,94 +4071,148 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v5
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB6_2
+; GFX1032-DPP-NEXT:  .LBB6_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v3, v4, v5
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB6_2
+; GFX1164-DPP-NEXT:  .LBB6_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
@@ -2682,21 +4220,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v3, v4, v5
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB6_2
+; GFX1132-DPP-NEXT:  .LBB6_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
@@ -2706,256 +4270,466 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 {
 ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s10, -1
+; GFX7LESS-NEXT:    s_mov_b32 s11, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s8, s8, s3
+; GFX7LESS-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_3
+; GFX7LESS-NEXT:  ; %bb.1:
+; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX7LESS-NEXT:  .LBB7_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9-NEXT:    s_add_u32 s8, s8, s3
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB7_3
+; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v3
+; GFX9-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX9-NEXT:  .LBB7_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s10, -1
+; GFX1064-NEXT:    s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s8, s8, s3
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1064-NEXT:  ; %bb.1:
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1064-NEXT:  .LBB7_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s10, -1
+; GFX1032-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s8, s8, s3
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    s_mov_b32 s2, 0
-; GFX1032-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1032-NEXT:  .LBB7_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b32 s2, 0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_clause 0x1
+; GFX1164-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1164-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1164-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1164-NEXT:  ; %bb.1:
+; GFX1164-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1164-NEXT:  .LBB7_3:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_clause 0x1
+; GFX1132-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1132-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1132-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:    s_mov_b32 s2, 0
-; GFX1132-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1132-NEXT:  .LBB7_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v3
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX9-DPP-NEXT:  .LBB7_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-DPP-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1064-DPP-NEXT:  .LBB7_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT:    s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1032-DPP-NEXT:  .LBB7_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT:    s_clause 0x1
+; GFX1164-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1164-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1164-DPP-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
+; GFX1164-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1164-DPP-NEXT:  .LBB7_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT:    s_clause 0x1
+; GFX1132-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
+; GFX1132-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
+; GFX1132-DPP-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
+; GFX1132-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX1132-DPP-NEXT:  .LBB7_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
   ret void
@@ -3021,11 +4795,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-NEXT:    s_mov_b32 s13, s9
-; GFX9-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -3040,23 +4814,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX9-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX9-NEXT:    s_add_i32 s2, s2, 32
+; GFX9-NEXT:    s_min_u32 s2, s3, s2
+; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX9-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execz .LBB8_5
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT:  .LBB8_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB8_4
+; GFX9-NEXT:  .LBB8_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
@@ -3066,43 +4864,67 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-NEXT:    s_mov_b32 s12, s8
-; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_mov_b32 s13, s9
-; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT:    s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT:    s_add_i32 s2, s2, 32
+; GFX1064-NEXT:    s_min_u32 s2, s3, s2
+; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB8_5
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT:  .LBB8_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB8_4
+; GFX1064-NEXT:  .LBB8_5:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
@@ -3112,94 +4934,142 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-NEXT:    s_mov_b32 s12, s8
-; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_mov_b32 s13, s9
-; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s0, 0
-; GFX1032-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT:    v_subrev_f32_e32 v2, s2, v2
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execz .LBB8_5
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT:  .LBB8_4: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB8_4
+; GFX1032-NEXT:  .LBB8_5:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-NEXT:    s_mov_b32 s12, s8
-; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
-; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT:    s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT:    s_add_i32 s2, s2, 32
+; GFX1164-NEXT:    s_min_u32 s2, s3, s2
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    v_subrev_f32_e32 v2, s4, v2
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execz .LBB8_5
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT:  .LBB8_4: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB8_4
+; GFX1164-NEXT:  .LBB8_5:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
@@ -3207,21 +5077,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s0, 0
-; GFX1132-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_subrev_f32_e32 v2, s2, v2
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execz .LBB8_5
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT:  .LBB8_4: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB8_4
+; GFX1132-NEXT:  .LBB8_5:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
@@ -3232,11 +5127,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
 ; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
 ; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
@@ -3251,23 +5146,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT:    s_nop 0
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT:    s_nop 1
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT:    v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX9-DPP-NEXT:  ; %bb.1:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    v_subrev_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX9-DPP-NEXT:  .LBB8_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
@@ -3277,43 +5202,70 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_sub_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1064-DPP-NEXT:  ; %bb.1:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v0, v1, v5
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX1064-DPP-NEXT:  .LBB8_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
@@ -3323,94 +5275,148 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1032-DPP-NEXT:  ; %bb.1:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v0, v1, v5
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX1032-DPP-NEXT:  .LBB8_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1164-DPP-NEXT:  ; %bb.1:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v3, v4, v5
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX1164-DPP-NEXT:  .LBB8_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
@@ -3418,21 +5424,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1132-DPP-NEXT:  ; %bb.1:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v3, v4, v5
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v3
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX1132-DPP-NEXT:  .LBB8_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue monotonic, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index cf986e7d314a20..84f67b3faac3c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -186,6 +186,10 @@
 ; GCN-O1-NEXT:        Cycle Info Analysis
 ; GCN-O1-NEXT:    FunctionPass Manager
 ; GCN-O1-NEXT:      Infer address spaces
+; GCN-O1-NEXT:      Dominator Tree Construction
+; GCN-O1-NEXT:      Cycle Info Analysis
+; GCN-O1-NEXT:      Uniformity Analysis
+; GCN-O1-NEXT:      AMDGPU atomic optimizations
 ; GCN-O1-NEXT:      Expand Atomic instructions
 ; GCN-O1-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-NEXT:      Dominator Tree Construction
@@ -241,14 +245,11 @@
 ; GCN-O1-NEXT:        Cycle Info Analysis
 ; GCN-O1-NEXT:        Uniformity Analysis
 ; GCN-O1-NEXT:        AMDGPU IR late optimizations
-; GCN-O1-NEXT:        AMDGPU atomic optimizations
 ; GCN-O1-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-NEXT:        Function Alias Analysis Results
 ; GCN-O1-NEXT:        Natural Loop Information
 ; GCN-O1-NEXT:        Code sinking
 ; GCN-O1-NEXT:        Post-Dominator Tree Construction
-; GCN-O1-NEXT:        Cycle Info Analysis
-; GCN-O1-NEXT:        Uniformity Analysis
 ; GCN-O1-NEXT:        Unify divergent function exit nodes
 ; GCN-O1-NEXT:        Lazy Value Information Analysis
 ; GCN-O1-NEXT:        Lower SwitchInst's to branches
@@ -457,6 +458,10 @@
 ; GCN-O1-OPTS-NEXT:        Cycle Info Analysis
 ; GCN-O1-OPTS-NEXT:    FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:      Infer address spaces
+; GCN-O1-OPTS-NEXT:      Dominator Tree Construction
+; GCN-O1-OPTS-NEXT:      Cycle Info Analysis
+; GCN-O1-OPTS-NEXT:      Uniformity Analysis
+; GCN-O1-OPTS-NEXT:      AMDGPU atomic optimizations
 ; GCN-O1-OPTS-NEXT:      Expand Atomic instructions
 ; GCN-O1-OPTS-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-OPTS-NEXT:      Dominator Tree Construction
@@ -526,14 +531,11 @@
 ; GCN-O1-OPTS-NEXT:        Cycle Info Analysis
 ; GCN-O1-OPTS-NEXT:        Uniformity Analysis
 ; GCN-O1-OPTS-NEXT:        AMDGPU IR late optimizations
-; GCN-O1-OPTS-NEXT:        AMDGPU atomic optimizations
 ; GCN-O1-OPTS-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-OPTS-NEXT:        Function Alias Analysis Results
 ; GCN-O1-OPTS-NEXT:        Natural Loop Information
 ; GCN-O1-OPTS-NEXT:        Code sinking
 ; GCN-O1-OPTS-NEXT:        Post-Dominator Tree Construction
-; GCN-O1-OPTS-NEXT:        Cycle Info Analysis
-; GCN-O1-OPTS-NEXT:        Uniformity Analysis
 ; GCN-O1-OPTS-NEXT:        Unify divergent function exit nodes
 ; GCN-O1-OPTS-NEXT:        Lazy Value Information Analysis
 ; GCN-O1-OPTS-NEXT:        Lower SwitchInst's to branches
@@ -750,6 +752,10 @@
 ; GCN-O2-NEXT:        Cycle Info Analysis
 ; GCN-O2-NEXT:    FunctionPass Manager
 ; GCN-O2-NEXT:      Infer address spaces
+; GCN-O2-NEXT:      Dominator Tree Construction
+; GCN-O2-NEXT:      Cycle Info Analysis
+; GCN-O2-NEXT:      Uniformity Analysis
+; GCN-O2-NEXT:      AMDGPU atomic optimizations
 ; GCN-O2-NEXT:      Expand Atomic instructions
 ; GCN-O2-NEXT:      AMDGPU Promote Alloca
 ; GCN-O2-NEXT:      Dominator Tree Construction
@@ -827,14 +833,11 @@
 ; GCN-O2-NEXT:        Cycle Info Analysis
 ; GCN-O2-NEXT:        Uniformity Analysis
 ; GCN-O2-NEXT:        AMDGPU IR late optimizations
-; GCN-O2-NEXT:        AMDGPU atomic optimizations
 ; GCN-O2-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O2-NEXT:        Function Alias Analysis Results
 ; GCN-O2-NEXT:        Natural Loop Information
 ; GCN-O2-NEXT:        Code sinking
 ; GCN-O2-NEXT:        Post-Dominator Tree Construction
-; GCN-O2-NEXT:        Cycle Info Analysis
-; GCN-O2-NEXT:        Uniformity Analysis
 ; GCN-O2-NEXT:        Unify divergent function exit nodes
 ; GCN-O2-NEXT:        Lazy Value Information Analysis
 ; GCN-O2-NEXT:        Lower SwitchInst's to branches
@@ -1053,6 +1056,10 @@
 ; GCN-O3-NEXT:        Cycle Info Analysis
 ; GCN-O3-NEXT:    FunctionPass Manager
 ; GCN-O3-NEXT:      Infer address spaces
+; GCN-O3-NEXT:      Dominator Tree Construction
+; GCN-O3-NEXT:      Cycle Info Analysis
+; GCN-O3-NEXT:      Uniformity Analysis
+; GCN-O3-NEXT:      AMDGPU atomic optimizations
 ; GCN-O3-NEXT:      Expand Atomic instructions
 ; GCN-O3-NEXT:      AMDGPU Promote Alloca
 ; GCN-O3-NEXT:      Dominator Tree Construction
@@ -1142,14 +1149,11 @@
 ; GCN-O3-NEXT:        Cycle Info Analysis
 ; GCN-O3-NEXT:        Uniformity Analysis
 ; GCN-O3-NEXT:        AMDGPU IR late optimizations
-; GCN-O3-NEXT:        AMDGPU atomic optimizations
 ; GCN-O3-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O3-NEXT:        Function Alias Analysis Results
 ; GCN-O3-NEXT:        Natural Loop Information
 ; GCN-O3-NEXT:        Code sinking
 ; GCN-O3-NEXT:        Post-Dominator Tree Construction
-; GCN-O3-NEXT:        Cycle Info Analysis
-; GCN-O3-NEXT:        Uniformity Analysis
 ; GCN-O3-NEXT:        Unify divergent function exit nodes
 ; GCN-O3-NEXT:        Lazy Value Information Analysis
 ; GCN-O3-NEXT:        Lower SwitchInst's to branches

diff  --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 039fc445aa590f..3641bd4ef865dc 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -319,53 +319,78 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
 ; GFX7-LABEL: lds_ds_fadd:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_mov_b64 s[6:7], exec
+; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
+; GFX7-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshl_b32 s4, s3, 3
-; GFX7-NEXT:    s_add_i32 s4, s4, 32
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    ds_read_b32 v0, v0
 ; GFX7-NEXT:    s_add_i32 s3, s3, 4
-; GFX7-NEXT:    s_lshl_b32 s6, s3, 3
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; implicit-def: $vgpr1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT:    s_cbranch_execz .LBB2_4
+; GFX7-NEXT:  ; %bb.1:
+; GFX7-NEXT:    s_lshl_b32 s8, s3, 3
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    ds_read_b32 v1, v1
+; GFX7-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v2, s6
+; GFX7-NEXT:    v_mul_f32_e32 v2, 0x42280000, v2
+; GFX7-NEXT:    s_mov_b64 s[6:7], 0
+; GFX7-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_add_f32_e32 v4, v3, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v0, v0, v1, v2
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v1, v3, v4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX7-NEXT:  ; %bb.3: ; %Flow15
+; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT:  .LBB2_4: ; %Flow16
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_mov_b64 s[6:7], exec
+; GFX7-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v1, s6, 0
+; GFX7-NEXT:    v_mbcnt_hi_u32_b32_e32 v1, s7, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT:    s_cbranch_execz .LBB2_7
+; GFX7-NEXT:  ; %bb.5:
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    ds_read_b32 v1, v1
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB2_3: ; %atomicrmw.start2
+; GFX7-NEXT:    ds_read_b32 v2, v1
+; GFX7-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
+; GFX7-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX7-NEXT:    s_mov_b64 s[6:7], 0
+; GFX7-NEXT:  .LBB2_6: ; %atomicrmw.start2
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    v_add_f32_e32 v3, v2, v1
+; GFX7-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v2, v3, v1, v2
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v3, v4, v2, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB2_3
-; GFX7-NEXT:  ; %bb.4: ; %atomicrmw.end1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT:    s_cbranch_execnz .LBB2_6
+; GFX7-NEXT:  .LBB2_7: ; %Flow14
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7-NEXT:    ds_read_b32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
+; GFX7-NEXT:    v_add_f32_e32 v0, s8, v0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB2_5: ; %atomicrmw.start8
+; GFX7-NEXT:  .LBB2_8: ; %atomicrmw.start8
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v1
@@ -377,8 +402,8 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB2_5
-; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end7
+; GFX7-NEXT:    s_cbranch_execnz .LBB2_8
+; GFX7-NEXT:  ; %bb.9: ; %atomicrmw.end7
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
@@ -390,52 +415,78 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
 ; GFX8-LABEL: lds_ds_fadd:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s4, s3, 3
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    ds_read_b32 v0, v0 offset:32
 ; GFX8-NEXT:    s_add_i32 s3, s3, 4
-; GFX8-NEXT:    s_lshl_b32 s6, s3, 3
-; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_cbranch_execz .LBB2_4
+; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_lshl_b32 s8, s3, 3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX8-NEXT:    ds_read_b32 v1, v1
+; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v2, s6
+; GFX8-NEXT:    v_mul_f32_e32 v2, 0x42280000, v2
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX8-NEXT:    v_add_f32_e32 v4, v3, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_cmpst_rtn_b32 v0, v0, v1, v2
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v1, v1, v3, v4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB2_1
-; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX8-NEXT:  ; %bb.3: ; %Flow17
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:  .LBB2_4: ; %Flow18
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32_e64 v1, s6, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32_e32 v1, s7, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_cbranch_execz .LBB2_7
+; GFX8-NEXT:  ; %bb.5:
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    ds_read_b32 v1, v1
-; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB2_3: ; %atomicrmw.start2
+; GFX8-NEXT:    ds_read_b32 v2, v1
+; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:  .LBB2_6: ; %atomicrmw.start2
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_add_f32_e32 v3, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_cmpst_rtn_b32 v2, v3, v1, v2
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v3, v4, v2, v3
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v1, v2
-; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB2_3
-; GFX8-NEXT:  ; %bb.4: ; %atomicrmw.end1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v3
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB2_6
+; GFX8-NEXT:  .LBB2_7: ; %Flow16
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    ds_read_b32 v1, v1
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, s8, v0
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB2_5: ; %atomicrmw.start8
+; GFX8-NEXT:  .LBB2_8: ; %atomicrmw.start8
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v1
@@ -447,8 +498,8 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB2_5
-; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end7
+; GFX8-NEXT:    s_cbranch_execnz .LBB2_8
+; GFX8-NEXT:  ; %bb.9: ; %atomicrmw.end7
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
@@ -635,51 +686,76 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-LABEL: lds_ds_fadd_one_as:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_mov_b64 s[6:7], exec
+; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
+; GFX7-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshl_b32 s4, s3, 3
-; GFX7-NEXT:    s_add_i32 s4, s4, 32
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    ds_read_b32 v0, v0
 ; GFX7-NEXT:    s_add_i32 s3, s3, 4
-; GFX7-NEXT:    s_lshl_b32 s6, s3, 3
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; implicit-def: $vgpr1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT:    s_cbranch_execz .LBB3_4
+; GFX7-NEXT:  ; %bb.1:
+; GFX7-NEXT:    s_lshl_b32 s8, s3, 3
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    ds_read_b32 v1, v1
+; GFX7-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v2, s6
+; GFX7-NEXT:    v_mul_f32_e32 v2, 0x42280000, v2
+; GFX7-NEXT:    s_mov_b64 s[6:7], 0
+; GFX7-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v0, v0, v1, v2
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_add_f32_e32 v4, v3, v2
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v1, v3, v4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX7-NEXT:  ; %bb.3: ; %Flow15
+; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT:  .LBB3_4: ; %Flow16
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_mov_b64 s[6:7], exec
+; GFX7-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v1, s6, 0
+; GFX7-NEXT:    v_mbcnt_hi_u32_b32_e32 v1, s7, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT:    s_cbranch_execz .LBB3_7
+; GFX7-NEXT:  ; %bb.5:
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    ds_read_b32 v1, v1
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB3_3: ; %atomicrmw.start2
+; GFX7-NEXT:    ds_read_b32 v2, v1
+; GFX7-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
+; GFX7-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX7-NEXT:    s_mov_b64 s[6:7], 0
+; GFX7-NEXT:  .LBB3_6: ; %atomicrmw.start2
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v2, v3, v1, v2
+; GFX7-NEXT:    v_add_f32_e32 v3, v2, v1
+; GFX7-NEXT:    v_mov_b32_e32 v4, s3
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v3, v4, v2, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB3_3
-; GFX7-NEXT:  ; %bb.4: ; %atomicrmw.end1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT:    s_cbranch_execnz .LBB3_6
+; GFX7-NEXT:  .LBB3_7: ; %Flow14
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7-NEXT:    ds_read_b32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
+; GFX7-NEXT:    v_add_f32_e32 v0, s8, v0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB3_5: ; %atomicrmw.start8
+; GFX7-NEXT:  .LBB3_8: ; %atomicrmw.start8
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v1
@@ -690,8 +766,8 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB3_5
-; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end7
+; GFX7-NEXT:    s_cbranch_execnz .LBB3_8
+; GFX7-NEXT:  ; %bb.9: ; %atomicrmw.end7
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
@@ -703,50 +779,76 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa
 ; GFX8-LABEL: lds_ds_fadd_one_as:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s4, s3, 3
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    ds_read_b32 v0, v0 offset:32
 ; GFX8-NEXT:    s_add_i32 s3, s3, 4
-; GFX8-NEXT:    s_lshl_b32 s6, s3, 3
-; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_cbranch_execz .LBB3_4
+; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_lshl_b32 s8, s3, 3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX8-NEXT:    ds_read_b32 v1, v1
+; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v2, s6
+; GFX8-NEXT:    v_mul_f32_e32 v2, 0x42280000, v2
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:  .LBB3_2: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
-; GFX8-NEXT:    ds_cmpst_rtn_b32 v0, v0, v1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX8-NEXT:    v_add_f32_e32 v4, v3, v2
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v1, v1, v3, v4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX8-NEXT:  ; %bb.3: ; %Flow17
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:  .LBB3_4: ; %Flow18
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32_e64 v1, s6, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32_e32 v1, s7, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_cbranch_execz .LBB3_7
+; GFX8-NEXT:  ; %bb.5:
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    ds_read_b32 v1, v1
-; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB3_3: ; %atomicrmw.start2
+; GFX8-NEXT:    ds_read_b32 v2, v1
+; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:  .LBB3_6: ; %atomicrmw.start2
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    ds_cmpst_rtn_b32 v2, v3, v1, v2
+; GFX8-NEXT:    v_add_f32_e32 v3, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s3
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v3, v4, v2, v3
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v1, v2
-; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB3_3
-; GFX8-NEXT:  ; %bb.4: ; %atomicrmw.end1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v3
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB3_6
+; GFX8-NEXT:  .LBB3_7: ; %Flow16
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    ds_read_b32 v1, v1
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, s8, v0
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB3_5: ; %atomicrmw.start8
+; GFX8-NEXT:  .LBB3_8: ; %atomicrmw.start8
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v1
@@ -757,8 +859,8 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB3_5
-; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end7
+; GFX8-NEXT:    s_cbranch_execnz .LBB3_8
+; GFX8-NEXT:  ; %bb.9: ; %atomicrmw.end7
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000


        


More information about the llvm-commits mailing list