[llvm] 6ef6c95 - [AMDGPU] Reorder atomic optimizer to avoid CAS loop.
Pravin Jagtap via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 30 09:08:13 PDT 2023
Author: Pravin Jagtap
Date: 2023-08-30T12:05:21-04:00
New Revision: 6ef6c954c6dcd34b99897cc00da9bf1b651398b5
URL: https://github.com/llvm/llvm-project/commit/6ef6c954c6dcd34b99897cc00da9bf1b651398b5
DIFF: https://github.com/llvm/llvm-project/commit/6ef6c954c6dcd34b99897cc00da9bf1b651398b5.diff
LOG: [AMDGPU] Reorder atomic optimizer to avoid CAS loop.
The AtomicExpand pass emits a CAS loop for FP operations,
which limits the optimizations offered by the atomic optimizer.
Running the atomic optimizer before AtomicExpand allows
better codegen.
Reviewed By: arsenm, #amdgpu
Differential Revision: https://reviews.llvm.org/D157265
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 43c9f183f17ba0..2e6b29d986af75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1000,6 +1000,13 @@ void AMDGPUPassConfig::addIRPasses() {
if (TM.getOptLevel() > CodeGenOpt::None)
addPass(createInferAddressSpacesPass());
+ // Run atomic optimizer before Atomic Expand
+ if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
+ (TM.getOptLevel() >= CodeGenOpt::Less) &&
+ (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
+ addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
+ }
+
addPass(createAtomicExpandPass());
if (TM.getOptLevel() > CodeGenOpt::None) {
@@ -1124,11 +1131,6 @@ bool GCNPassConfig::addPreISel() {
if (TM->getOptLevel() > CodeGenOpt::None)
addPass(createAMDGPULateCodeGenPreparePass());
- if ((TM->getOptLevel() >= CodeGenOpt::Less) &&
- (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
- addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
- }
-
if (TM->getOptLevel() > CodeGenOpt::None)
addPass(createSinkingPass());
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
index 9954e67a1c32d3..5516741e57e929 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -141,7 +141,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1)
define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw
; GFX90A_GFX940: bb.1 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000)
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
@@ -149,11 +149,11 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
- ; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.2
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.2 (%ir-block.5):
- ; GFX90A_GFX940-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
+ ; GFX90A_GFX940-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec
; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0
@@ -196,29 +196,22 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec
; GFX90A_GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec
- ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.3
; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.36):
- ; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000)
+ ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.35):
+ ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_BRANCH %bb.5
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.4.Flow:
- ; GFX90A_GFX940-NEXT: successors: %bb.6(0x80000000)
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX90A_GFX940-NEXT: S_BRANCH %bb.6
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.38):
- ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX90A_GFX940-NEXT: S_BRANCH %bb.4
; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.6 (%ir-block.39):
+ ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.37):
+ ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index 14c058e3d07dc9..5787392a5901ac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -206,7 +206,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.3
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: bb.3 (%ir-block.39):
+ ; GFX11-NEXT: bb.3 (%ir-block.36):
; GFX11-NEXT: successors: %bb.5(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -220,7 +220,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.6
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: bb.5 (%ir-block.42):
+ ; GFX11-NEXT: bb.5 (%ir-block.39):
; GFX11-NEXT: successors: %bb.4(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2
@@ -231,7 +231,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.4
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: bb.6 (%ir-block.50):
+ ; GFX11-NEXT: bb.6 (%ir-block.47):
; GFX11-NEXT: $vgpr0 = COPY [[PHI]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
diff --git a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
index 6805683d46ee38..3a8041f7a37e12 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
@@ -14,8 +14,7 @@ define amdgpu_ps void @main(i32 %arg) {
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: s_branch .LBB0_2
-; GFX10-NEXT: .LBB0_1: ; %Flow
-; GFX10-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; GFX10-NEXT: .LBB0_1: ; in Loop: Header=BB0_2 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX10-NEXT: s_and_b32 s0, exec_lo, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index dca48a5c3c9587..1ebd864e7e03aa 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -41,6 +41,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX7-NEXT: .LBB0_4: ; %Flow
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_wqm_b64 s[4:5], -1
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX7-NEXT: s_cbranch_vccnz .LBB0_6
; GFX7-NEXT: ; %bb.5: ; %if
@@ -75,6 +76,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX89-NEXT: .LBB0_4: ; %Flow
; GFX89-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX89-NEXT: s_wqm_b64 s[4:5], -1
+; GFX89-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX89-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX89-NEXT: s_cbranch_vccnz .LBB0_6
; GFX89-NEXT: ; %bb.5: ; %if
@@ -110,6 +112,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX1064-NEXT: .LBB0_4: ; %Flow
; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT: s_wqm_b64 s[4:5], -1
+; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX1064-NEXT: s_cbranch_vccnz .LBB0_6
; GFX1064-NEXT: ; %bb.5: ; %if
@@ -144,6 +147,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX1032-NEXT: .LBB0_4: ; %Flow
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT: s_wqm_b32 s4, -1
+; GFX1032-NEXT: s_and_b32 s4, s4, s4
; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_vccnz .LBB0_6
; GFX1032-NEXT: ; %bb.5: ; %if
@@ -182,7 +186,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX1164-NEXT: .LBB0_4: ; %Flow
; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1164-NEXT: s_wqm_b64 s[4:5], -1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5]
; GFX1164-NEXT: s_cbranch_vccnz .LBB0_6
; GFX1164-NEXT: ; %bb.5: ; %if
@@ -222,7 +227,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX1132-NEXT: .LBB0_4: ; %Flow
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1132-NEXT: s_wqm_b32 s4, -1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_b32 s4, s4, s4
; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_vccnz .LBB0_6
; GFX1132-NEXT: ; %bb.5: ; %if
@@ -307,6 +313,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX8-NEXT: .LBB1_4: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_wqm_b64 s[4:5], -1
+; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX8-NEXT: s_cbranch_vccnz .LBB1_6
; GFX8-NEXT: ; %bb.5: ; %if
@@ -363,6 +370,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX9-NEXT: .LBB1_4: ; %Flow
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_wqm_b64 s[4:5], -1
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX9-NEXT: s_cbranch_vccnz .LBB1_6
; GFX9-NEXT: ; %bb.5: ; %if
@@ -426,6 +434,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1064-NEXT: .LBB1_4: ; %Flow
; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT: s_wqm_b64 s[4:5], -1
+; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX1064-NEXT: s_cbranch_vccnz .LBB1_6
; GFX1064-NEXT: ; %bb.5: ; %if
@@ -480,6 +489,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1032-NEXT: .LBB1_4: ; %Flow
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT: s_wqm_b32 s4, -1
+; GFX1032-NEXT: s_and_b32 s4, s4, s4
; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_vccnz .LBB1_6
; GFX1032-NEXT: ; %bb.5: ; %if
@@ -552,7 +562,8 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1164-NEXT: .LBB1_4: ; %Flow
; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1164-NEXT: s_wqm_b64 s[4:5], -1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5]
; GFX1164-NEXT: s_cbranch_vccnz .LBB1_6
; GFX1164-NEXT: ; %bb.5: ; %if
@@ -616,7 +627,8 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1132-NEXT: .LBB1_4: ; %Flow
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1132-NEXT: s_wqm_b32 s4, -1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_b32 s4, s4, s4
; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_vccnz .LBB1_6
; GFX1132-NEXT: ; %bb.5: ; %if
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index d9d2839635264b..c2b51467edd08d 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -147,7 +147,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1)
define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw
; GFX90A_GFX940: bb.0 (%ir-block.0):
- ; GFX90A_GFX940-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+ ; GFX90A_GFX940-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000)
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -156,11 +156,11 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
- ; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.1
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.1 (%ir-block.5):
- ; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
@@ -188,30 +188,23 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
- ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.2
; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.36):
- ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000)
+ ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.35):
+ ; GFX90A_GFX940-NEXT: successors: %bb.3(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %1
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
- ; GFX90A_GFX940-NEXT: S_BRANCH %bb.4
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.3.Flow:
- ; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000)
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX90A_GFX940-NEXT: S_BRANCH %bb.5
- ; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.38):
- ; GFX90A_GFX940-NEXT: successors: %bb.3(0x80000000)
+ ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX90A_GFX940-NEXT: S_BRANCH %bb.3
; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.39):
+ ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.37):
+ ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
index a545c024fb9eee..c559750d4f1b46 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -203,7 +203,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.2
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: bb.2 (%ir-block.39):
+ ; GFX11-NEXT: bb.2 (%ir-block.36):
; GFX11-NEXT: successors: %bb.4(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -218,7 +218,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.5
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: bb.4 (%ir-block.42):
+ ; GFX11-NEXT: bb.4 (%ir-block.39):
; GFX11-NEXT: successors: %bb.3(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
@@ -228,7 +228,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.3
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: bb.5 (%ir-block.50):
+ ; GFX11-NEXT: bb.5 (%ir-block.47):
; GFX11-NEXT: $vgpr0 = COPY [[PHI]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index debcc67b29a5d5..0499f83de1c92a 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -8,138 +8,216 @@
define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_ret_f32:
; GFX900: ; %bb.0:
+; GFX900-NEXT: s_mov_b64 s[4:5], exec
+; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: ; implicit-def: $vgpr1
+; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX900-NEXT: s_cbranch_execz .LBB0_4
+; GFX900-NEXT: ; %bb.1:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX900-NEXT: s_mov_b64 s[2:3], 0
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
+; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
+; GFX900-NEXT: s_mov_b64 s[4:5], 0
+; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v1, s4
-; GFX900-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX900-NEXT: v_mov_b32_e32 v1, s6
+; GFX900-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX900-NEXT: v_mov_b32_e32 v5, v1
+; GFX900-NEXT: v_add_f32_e32 v4, v5, v2
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX900-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
-; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT: s_cbranch_execnz .LBB0_1
-; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_cbranch_execnz .LBB0_2
+; GFX900-NEXT: ; %bb.3: ; %Flow
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT: .LBB0_4: ; %Flow1
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX900-NEXT: global_store_dword v[0:1], v1, off
+; GFX900-NEXT: v_readfirstlane_b32 s0, v1
+; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX900-NEXT: v_mad_f32 v0, v0, 4.0, s0
+; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_f32:
; GFX908: ; %bb.0:
+; GFX908-NEXT: s_mov_b64 s[4:5], exec
+; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT: ; implicit-def: $vgpr1
+; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX908-NEXT: s_cbranch_execz .LBB0_4
+; GFX908-NEXT: ; %bb.1:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX908-NEXT: s_mov_b64 s[2:3], 0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0
+; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
+; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX908-NEXT: v_mov_b32_e32 v3, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, s4
-; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX908-NEXT: v_mov_b32_e32 v1, s6
+; GFX908-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX908-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX908-NEXT: s_cbranch_execnz .LBB0_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB0_2
+; GFX908-NEXT: ; %bb.3: ; %Flow
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: .LBB0_4: ; %Flow1
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX908-NEXT: global_store_dword v[0:1], v1, off
+; GFX908-NEXT: v_readfirstlane_b32 s0, v1
+; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX908-NEXT: v_mad_f32 v0, v0, 4.0, s0
+; GFX908-NEXT: global_store_dword v[0:1], v0, off
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32:
; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: ; implicit-def: $vgpr1
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB0_4
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
+; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, s4
-; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX90A-NEXT: v_mov_b32_e32 v1, s6
+; GFX90A-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB0_2
+; GFX90A-NEXT: ; %bb.3: ; %Flow
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: .LBB0_4: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: global_store_dword v[0:1], v1, off
+; GFX90A-NEXT: v_readfirstlane_b32 s0, v1
+; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX90A-NEXT: v_mad_f32 v0, v0, 4.0, s0
+; GFX90A-NEXT: global_store_dword v[0:1], v0, off
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_ret_f32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s4, exec_lo
+; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10-NEXT: ; implicit-def: $vgpr1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB0_4
+; GFX10-NEXT: ; %bb.1:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s4
+; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: s_mov_b32 s2, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_cbranch_execnz .LBB0_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: s_cbranch_execnz .LBB0_2
+; GFX10-NEXT: ; %bb.3: ; %Flow
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: .LBB0_4: ; %Flow1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: global_store_dword v[0:1], v1, off
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX10-NEXT: v_mad_f32 v0, v0, 4.0, s0
+; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_fadd_ret_f32:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s4, exec_lo
+; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: ; implicit-def: $vgpr1
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB0_4
+; GFX11-NEXT: ; %bb.1:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX11-NEXT: v_dual_mul_f32 v2, 4.0, v1 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v1
+; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT: s_cbranch_execnz .LBB0_2
+; GFX11-NEXT: ; %bb.3: ; %Flow
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT: .LBB0_4: ; %Flow1
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT: global_store_b32 v[0:1], v1, off
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0
+; GFX11-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX11-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -151,54 +229,88 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0
define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr) #2 {
; GFX900-LABEL: global_atomic_fadd_ret_f32_ieee:
; GFX900: ; %bb.0:
+; GFX900-NEXT: s_mov_b64 s[4:5], exec
+; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: ; implicit-def: $vgpr1
+; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX900-NEXT: s_cbranch_execz .LBB1_4
+; GFX900-NEXT: ; %bb.1:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX900-NEXT: s_mov_b64 s[2:3], 0
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
+; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
+; GFX900-NEXT: s_mov_b64 s[4:5], 0
+; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v1, s4
-; GFX900-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX900-NEXT: v_mov_b32_e32 v1, s6
+; GFX900-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX900-NEXT: v_mov_b32_e32 v5, v1
+; GFX900-NEXT: v_add_f32_e32 v4, v5, v2
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX900-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
-; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT: s_cbranch_execnz .LBB1_1
-; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_cbranch_execnz .LBB1_2
+; GFX900-NEXT: ; %bb.3: ; %Flow
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT: .LBB1_4: ; %Flow1
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX900-NEXT: global_store_dword v[0:1], v1, off
+; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX900-NEXT: v_readfirstlane_b32 s0, v1
+; GFX900-NEXT: v_mul_f32_e32 v0, 4.0, v0
+; GFX900-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_f32_ieee:
; GFX908: ; %bb.0:
+; GFX908-NEXT: s_mov_b64 s[4:5], exec
+; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT: ; implicit-def: $vgpr1
+; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX908-NEXT: s_cbranch_execz .LBB1_4
+; GFX908-NEXT: ; %bb.1:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX908-NEXT: s_mov_b64 s[2:3], 0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0
+; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
+; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX908-NEXT: v_mov_b32_e32 v3, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, s4
-; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX908-NEXT: v_mov_b32_e32 v1, s6
+; GFX908-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX908-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX908-NEXT: s_cbranch_execnz .LBB1_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB1_2
+; GFX908-NEXT: ; %bb.3: ; %Flow
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: .LBB1_4: ; %Flow1
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX908-NEXT: global_store_dword v[0:1], v1, off
+; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX908-NEXT: v_readfirstlane_b32 s0, v1
+; GFX908-NEXT: v_mul_f32_e32 v0, 4.0, v0
+; GFX908-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX908-NEXT: global_store_dword v[0:1], v0, off
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee:
@@ -231,30 +343,46 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr
;
; GFX10-LABEL: global_atomic_fadd_ret_f32_ieee:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s4, exec_lo
+; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10-NEXT: ; implicit-def: $vgpr1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB1_4
+; GFX10-NEXT: ; %bb.1:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s4
+; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: s_mov_b32 s2, 0
-; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_cbranch_execnz .LBB1_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: s_cbranch_execnz .LBB1_2
+; GFX10-NEXT: ; %bb.3: ; %Flow
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: .LBB1_4: ; %Flow1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: global_store_dword v[0:1], v1, off
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-NEXT: v_mul_f32_e32 v0, 4.0, v0
+; GFX10-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee:
@@ -294,26 +422,36 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr
define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_noret_f32:
; GFX900: ; %bb.0:
+; GFX900-NEXT: s_mov_b64 s[2:3], exec
+; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT: s_cbranch_execz .LBB2_3
+; GFX900-NEXT: ; %bb.1:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
+; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
; GFX900-NEXT: s_mov_b64 s[2:3], 0
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, s4
-; GFX900-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX900-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX900-NEXT: v_add_f32_e32 v0, v1, v2
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT: v_mov_b32_e32 v1, v0
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT: s_cbranch_execnz .LBB2_1
-; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT: s_cbranch_execnz .LBB2_2
+; GFX900-NEXT: .LBB2_3:
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_noret_f32:
@@ -360,19 +498,28 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #
;
; GFX10-LABEL: global_atomic_fadd_noret_f32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s3, exec_lo
+; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB2_3
+; GFX10-NEXT: ; %bb.1:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: s_mov_b32 s2, 0
-; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
@@ -380,8 +527,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_cbranch_execnz .LBB2_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_cbranch_execnz .LBB2_2
+; GFX10-NEXT: .LBB2_3:
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_fadd_noret_f32:
@@ -411,26 +558,36 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #
define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %ptr) #2 {
; GFX900-LABEL: global_atomic_fadd_noret_f32_ieee:
; GFX900: ; %bb.0:
+; GFX900-NEXT: s_mov_b64 s[2:3], exec
+; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT: s_cbranch_execz .LBB3_3
+; GFX900-NEXT: ; %bb.1:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
+; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
; GFX900-NEXT: s_mov_b64 s[2:3], 0
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, s4
-; GFX900-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX900-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX900-NEXT: v_add_f32_e32 v0, v1, v2
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT: v_mov_b32_e32 v1, v0
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT: s_cbranch_execnz .LBB3_1
-; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT: s_cbranch_execnz .LBB3_2
+; GFX900-NEXT: .LBB3_3:
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_noret_f32_ieee:
@@ -477,19 +634,28 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p
;
; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s3, exec_lo
+; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB3_3
+; GFX10-NEXT: ; %bb.1:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: s_mov_b32 s2, 0
-; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
@@ -497,8 +663,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_cbranch_execnz .LBB3_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_cbranch_execnz .LBB3_2
+; GFX10-NEXT: .LBB3_3:
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_fadd_noret_f32_ieee:
@@ -528,54 +694,86 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p
define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_ret_f32_agent:
; GFX900: ; %bb.0:
+; GFX900-NEXT: s_mov_b64 s[4:5], exec
+; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: ; implicit-def: $vgpr1
+; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX900-NEXT: s_cbranch_execz .LBB4_4
+; GFX900-NEXT: ; %bb.1:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX900-NEXT: s_mov_b64 s[2:3], 0
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
+; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
+; GFX900-NEXT: s_mov_b64 s[4:5], 0
+; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v1, s4
-; GFX900-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX900-NEXT: v_mov_b32_e32 v1, s6
+; GFX900-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX900-NEXT: v_mov_b32_e32 v5, v1
+; GFX900-NEXT: v_add_f32_e32 v4, v5, v2
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX900-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
-; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT: s_cbranch_execnz .LBB4_1
-; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_cbranch_execnz .LBB4_2
+; GFX900-NEXT: ; %bb.3: ; %Flow
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT: .LBB4_4: ; %Flow1
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX900-NEXT: global_store_dword v[0:1], v1, off
+; GFX900-NEXT: v_readfirstlane_b32 s0, v1
+; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX900-NEXT: v_mad_f32 v0, v0, 4.0, s0
+; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_f32_agent:
; GFX908: ; %bb.0:
+; GFX908-NEXT: s_mov_b64 s[4:5], exec
+; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT: ; implicit-def: $vgpr1
+; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX908-NEXT: s_cbranch_execz .LBB4_4
+; GFX908-NEXT: ; %bb.1:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX908-NEXT: s_mov_b64 s[2:3], 0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0
+; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
+; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX908-NEXT: v_mov_b32_e32 v3, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, s4
-; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX908-NEXT: v_mov_b32_e32 v1, s6
+; GFX908-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX908-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX908-NEXT: s_cbranch_execnz .LBB4_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB4_2
+; GFX908-NEXT: ; %bb.3: ; %Flow
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: .LBB4_4: ; %Flow1
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX908-NEXT: global_store_dword v[0:1], v1, off
+; GFX908-NEXT: v_readfirstlane_b32 s0, v1
+; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX908-NEXT: v_mad_f32 v0, v0, 4.0, s0
+; GFX908-NEXT: global_store_dword v[0:1], v0, off
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent:
@@ -607,30 +805,45 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt
;
; GFX10-LABEL: global_atomic_fadd_ret_f32_agent:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s4, exec_lo
+; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10-NEXT: ; implicit-def: $vgpr1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB4_4
+; GFX10-NEXT: ; %bb.1:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s4
+; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: s_mov_b32 s2, 0
-; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_cbranch_execnz .LBB4_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: s_cbranch_execnz .LBB4_2
+; GFX10-NEXT: ; %bb.3: ; %Flow
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: .LBB4_4: ; %Flow1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: global_store_dword v[0:1], v1, off
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX10-NEXT: v_mad_f32 v0, v0, 4.0, s0
+; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_fadd_ret_f32_agent:
@@ -670,138 +883,216 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt
define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %ptr) #0 {
; GFX900-LABEL: global_atomic_fadd_ret_f32_system:
; GFX900: ; %bb.0:
+; GFX900-NEXT: s_mov_b64 s[4:5], exec
+; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: ; implicit-def: $vgpr1
+; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX900-NEXT: s_cbranch_execz .LBB5_4
+; GFX900-NEXT: ; %bb.1:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX900-NEXT: s_mov_b64 s[2:3], 0
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
+; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
+; GFX900-NEXT: s_mov_b64 s[4:5], 0
+; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v1, s4
-; GFX900-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX900-NEXT: v_mov_b32_e32 v1, s6
+; GFX900-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX900-NEXT: v_mov_b32_e32 v5, v1
+; GFX900-NEXT: v_add_f32_e32 v4, v5, v2
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX900-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
-; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT: s_cbranch_execnz .LBB5_1
-; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_cbranch_execnz .LBB5_2
+; GFX900-NEXT: ; %bb.3: ; %Flow
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT: .LBB5_4: ; %Flow1
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX900-NEXT: global_store_dword v[0:1], v1, off
+; GFX900-NEXT: v_readfirstlane_b32 s0, v1
+; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX900-NEXT: v_mad_f32 v0, v0, 4.0, s0
+; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_f32_system:
; GFX908: ; %bb.0:
+; GFX908-NEXT: s_mov_b64 s[4:5], exec
+; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT: ; implicit-def: $vgpr1
+; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX908-NEXT: s_cbranch_execz .LBB5_4
+; GFX908-NEXT: ; %bb.1:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX908-NEXT: s_mov_b64 s[2:3], 0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0
+; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
+; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX908-NEXT: v_mov_b32_e32 v3, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, s4
-; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX908-NEXT: v_mov_b32_e32 v1, s6
+; GFX908-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX908-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX908-NEXT: s_cbranch_execnz .LBB5_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB5_2
+; GFX908-NEXT: ; %bb.3: ; %Flow
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: .LBB5_4: ; %Flow1
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX908-NEXT: global_store_dword v[0:1], v1, off
+; GFX908-NEXT: v_readfirstlane_b32 s0, v1
+; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX908-NEXT: v_mad_f32 v0, v0, 4.0, s0
+; GFX908-NEXT: global_store_dword v[0:1], v0, off
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32_system:
; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: ; implicit-def: $vgpr1
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB5_4
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
+; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, s4
-; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX90A-NEXT: v_mov_b32_e32 v1, s6
+; GFX90A-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB5_2
+; GFX90A-NEXT: ; %bb.3: ; %Flow
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: .LBB5_4: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: global_store_dword v[0:1], v1, off
+; GFX90A-NEXT: v_readfirstlane_b32 s0, v1
+; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX90A-NEXT: v_mad_f32 v0, v0, 4.0, s0
+; GFX90A-NEXT: global_store_dword v[0:1], v0, off
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_ret_f32_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s4, exec_lo
+; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10-NEXT: ; implicit-def: $vgpr1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB5_4
+; GFX10-NEXT: ; %bb.1:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s4
+; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: s_mov_b32 s2, 0
-; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_cbranch_execnz .LBB5_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: s_cbranch_execnz .LBB5_2
+; GFX10-NEXT: ; %bb.3: ; %Flow
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: .LBB5_4: ; %Flow1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: global_store_dword v[0:1], v1, off
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX10-NEXT: v_mad_f32 v0, v0, 4.0, s0
+; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_fadd_ret_f32_system:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s4, exec_lo
+; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: ; implicit-def: $vgpr1
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB5_4
+; GFX11-NEXT: ; %bb.1:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX11-NEXT: v_dual_mul_f32 v2, 4.0, v1 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v1
+; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_execnz .LBB5_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT: s_cbranch_execnz .LBB5_2
+; GFX11-NEXT: ; %bb.3: ; %Flow
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT: .LBB5_4: ; %Flow1
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT: global_store_b32 v[0:1], v1, off
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0
+; GFX11-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX11-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -813,54 +1104,86 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p
define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 {
; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b64 s[4:5], exec
+; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GCN-NEXT: s_cbranch_execz .LBB6_4
+; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b64 s[2:3], 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
+; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-NEXT: s_load_dword s6, s[0:1], 0x0
+; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s4
-; GCN-NEXT: .LBB6_1: ; %atomicrmw.start
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: .LBB6_2: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GCN-NEXT: v_mov_b32_e32 v5, v1
+; GCN-NEXT: v_add_f32_e32 v4, v5, v2
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GCN-NEXT: s_cbranch_execnz .LBB6_1
-; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB6_2
+; GCN-NEXT: ; %bb.3: ; %Flow
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: .LBB6_4: ; %Flow1
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
-; GCN-NEXT: global_store_dword v[0:1], v1, off
+; GCN-NEXT: v_readfirstlane_b32 s0, v1
+; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GCN-NEXT: v_mad_f32 v0, v0, 4.0, s0
+; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b64 s[4:5], exec
+; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX11-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX11-NEXT: ; implicit-def: $vgpr1
+; GFX11-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX11-NEXT: s_cbranch_execz .LBB6_4
+; GFX11-NEXT: ; %bb.1:
; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b64 s[2:3], 0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
+; GFX11-NEXT: s_mov_b64 s[4:5], 0
+; GFX11-NEXT: v_mul_f32_e32 v2, 4.0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX11-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v1
+; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX11-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_wbinvl1_vol
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX11-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX11-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX11-NEXT: s_cbranch_execnz .LBB6_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX11-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX11-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX11-NEXT: s_cbranch_execnz .LBB6_2
+; GFX11-NEXT: ; %bb.3: ; %Flow
+; GFX11-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11-NEXT: .LBB6_4: ; %Flow1
; GFX11-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11-NEXT: global_store_dword v[0:1], v1, off
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-NEXT: v_mad_f32 v0, v0, 4.0, s0
+; GFX11-NEXT: global_store_dword v[0:1], v0, off
; GFX11-NEXT: s_endpgm
%result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
store float %result, ptr addrspace(1) undef
@@ -916,91 +1239,130 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr
define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %ptr) {
; GFX900-LABEL: global_atomic_fadd_noret_f32_safe:
; GFX900: ; %bb.0:
+; GFX900-NEXT: s_mov_b64 s[2:3], exec
+; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT: s_cbranch_execz .LBB8_3
+; GFX900-NEXT: ; %bb.1:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
+; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
; GFX900-NEXT: s_mov_b64 s[2:3], 0
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, s4
-; GFX900-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX900-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX900-NEXT: v_add_f32_e32 v0, v1, v2
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT: v_mov_b32_e32 v1, v0
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT: s_cbranch_execnz .LBB8_1
-; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT: s_cbranch_execnz .LBB8_2
+; GFX900-NEXT: .LBB8_3:
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_noret_f32_safe:
; GFX908: ; %bb.0:
+; GFX908-NEXT: s_mov_b64 s[2:3], exec
+; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT: s_cbranch_execz .LBB8_3
+; GFX908-NEXT: ; %bb.1:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
+; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
; GFX908-NEXT: s_mov_b64 s[2:3], 0
-; GFX908-NEXT: v_mov_b32_e32 v2, 0
+; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT: v_mov_b32_e32 v3, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s4
-; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX908-NEXT: v_add_f32_e32 v0, v1, v2
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX908-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX908-NEXT: s_cbranch_execnz .LBB8_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_cbranch_execnz .LBB8_2
+; GFX908-NEXT: .LBB8_3:
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_noret_f32_safe:
; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB8_3
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
+; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
-; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX90A-NEXT: v_add_f32_e32 v0, v1, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB8_2
+; GFX90A-NEXT: .LBB8_3:
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_noret_f32_safe:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s3, exec_lo
+; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB8_3
+; GFX10-NEXT: ; %bb.1:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: s_mov_b32 s2, 0
-; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
@@ -1008,25 +1370,33 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_cbranch_execnz .LBB8_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_cbranch_execnz .LBB8_2
+; GFX10-NEXT: .LBB8_3:
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_fadd_noret_f32_safe:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-NEXT: s_mov_b32 s2, 0
+; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX11-NEXT: s_mov_b32 s4, exec_lo
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB8_3
+; GFX11-NEXT: ; %bb.1:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX11-NEXT: v_add_f32_e32 v0, v1, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
@@ -1034,8 +1404,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_cbranch_execnz .LBB8_2
+; GFX11-NEXT: .LBB8_3:
; GFX11-NEXT: s_endpgm
%result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
ret void
@@ -1044,26 +1414,35 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p
define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 {
; GFX900-LABEL: infer_as_before_atomic:
; GFX900: ; %bb.0:
+; GFX900-NEXT: s_mov_b64 s[2:3], exec
+; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT: s_cbranch_execz .LBB9_3
+; GFX900-NEXT: ; %bb.1:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
; GFX900-NEXT: s_mov_b64 s[2:3], 0
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v2, s5
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, s4
-; GFX900-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX900-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v1
-; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX900-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT: v_mov_b32_e32 v1, v0
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX900-NEXT: s_cbranch_execnz .LBB9_1
-; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT: s_cbranch_execnz .LBB9_2
+; GFX900-NEXT: .LBB9_3:
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: infer_as_before_atomic:
@@ -1108,26 +1487,34 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 {
;
; GFX10-LABEL: infer_as_before_atomic:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b32 s3, exec_lo
+; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB9_3
+; GFX10-NEXT: ; %bb.1:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: s_mov_b32 s2, 0
-; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v1
-; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_cbranch_execnz .LBB9_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_cbranch_execnz .LBB9_2
+; GFX10-NEXT: .LBB9_3:
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: infer_as_before_atomic:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 7ba4fb48d16f99..429bdd805ec5e1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -17,94 +17,133 @@ declare float @div.float.value()
define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3
+; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2
+; GFX7LESS-NEXT: .LBB0_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB0_3
+; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB0_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_cbranch_execnz .LBB0_2
+; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB0_3
+; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB0_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
+; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB0_3
+; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB0_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
+; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
@@ -153,68 +192,97 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
+; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
+; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
+; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
@@ -324,11 +392,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s11
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_add_u32 s8, s4, 44
+; GFX9-NEXT: s_add_u32 s8, s34, 44
; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -343,23 +411,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b32 s2, s1
+; GFX9-NEXT: s_ff1_i32_b32 s3, s0
+; GFX9-NEXT: s_add_i32 s2, s2, 32
+; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB1_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB1_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB1_4
+; GFX9-NEXT: .LBB1_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -369,43 +461,67 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s12, s8
-; GFX1064-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_mov_b32 s13, s9
-; GFX1064-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-NEXT: s_mov_b32 s14, s10
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b32 s14, s10
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB1_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB1_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB1_4
+; GFX1064-NEXT: .LBB1_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -415,43 +531,63 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s12, s8
-; GFX1032-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_mov_b32 s13, s9
-; GFX1032-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-NEXT: s_mov_b32 s14, s10
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b32 s14, s10
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
+; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB1_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB1_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB1_4
+; GFX1032-NEXT: .LBB1_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -566,11 +702,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-DPP-NEXT: s_mov_b32 s12, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX9-DPP-NEXT: s_mov_b32 s13, s9
-; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[4:5]
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -585,23 +721,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2
+; GFX9-DPP-NEXT: .LBB1_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -611,43 +786,80 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1064-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2
+; GFX1064-DPP-NEXT: .LBB1_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -657,43 +869,74 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2
+; GFX1032-DPP-NEXT: .LBB1_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
@@ -839,256 +1082,466 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s10, -1
+; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s8, s8, s3
+; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2
+; GFX7LESS-NEXT: .LBB2_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-NEXT: s_add_u32 s8, s8, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB2_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
+; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB2_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_cbranch_execnz .LBB2_2
+; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-NEXT: s_mov_b32 s2, 0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB2_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB2_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_mov_b32 s2, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB2_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT: s_cbranch_execz .LBB2_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s10, -1
+; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1]
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v3
+; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
ret void
@@ -1155,11 +1608,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s11
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_add_u32 s8, s4, 44
+; GFX9-NEXT: s_add_u32 s8, s34, 44
; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -1174,23 +1627,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b32 s2, s1
+; GFX9-NEXT: s_ff1_i32_b32 s3, s0
+; GFX9-NEXT: s_add_i32 s2, s2, 32
+; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB3_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB3_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB3_4
+; GFX9-NEXT: .LBB3_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1200,43 +1677,67 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s12, s8
-; GFX1064-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_mov_b32 s13, s9
-; GFX1064-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-NEXT: s_mov_b32 s14, s10
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b32 s14, s10
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB3_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB3_4
+; GFX1064-NEXT: .LBB3_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1246,94 +1747,142 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s12, s8
-; GFX1032-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_mov_b32 s13, s9
-; GFX1032-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-NEXT: s_mov_b32 s14, s10
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b32 s14, s10
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
+; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB3_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB3_4
+; GFX1032-NEXT: .LBB3_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-NEXT: s_mov_b32 s12, s8
-; GFX1164-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
; GFX1164-NEXT: s_mov_b32 s13, s9
-; GFX1164-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT: s_add_i32 s2, s2, 32
+; GFX1164-NEXT: s_min_u32 s2, s3, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB3_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB3_4
+; GFX1164-NEXT: .LBB3_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-NEXT: s_mov_b32 s13, s14
@@ -1341,21 +1890,46 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
+; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB3_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB3_4
+; GFX1132-NEXT: .LBB3_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1366,11 +1940,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-DPP-NEXT: s_mov_b32 s12, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX9-DPP-NEXT: s_mov_b32 s13, s9
-; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[4:5]
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -1385,23 +1959,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2
+; GFX9-DPP-NEXT: .LBB3_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1411,140 +2024,254 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1064-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1064-DPP-NEXT: s_endpgm
-;
-; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2
+; GFX1064-DPP-NEXT: .LBB3_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2
+; GFX1032-DPP-NEXT: .LBB3_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f32_e32 v4, v5, v0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2
+; GFX1164-DPP-NEXT: .LBB3_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
@@ -1552,21 +2279,60 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f32_e32 v4, v5, v0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2
+; GFX1132-DPP-NEXT: .LBB3_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
@@ -1576,256 +2342,466 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{
; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s10, -1
+; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s8, s8, s3
+; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2
+; GFX7LESS-NEXT: .LBB4_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-NEXT: s_add_u32 s8, s8, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB4_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
+; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB4_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_cbranch_execnz .LBB4_2
+; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-NEXT: s_mov_b32 s2, 0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB4_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB4_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_mov_b32 s2, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB4_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT: s_cbranch_execz .LBB4_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s10, -1
+; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1]
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v3
+; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic
ret void
@@ -1892,11 +2868,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s11
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_add_u32 s8, s4, 44
+; GFX9-NEXT: s_add_u32 s8, s34, 44
; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -1911,23 +2887,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b32 s2, s1
+; GFX9-NEXT: s_ff1_i32_b32 s3, s0
+; GFX9-NEXT: s_add_i32 s2, s2, 32
+; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB5_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB5_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB5_4
+; GFX9-NEXT: .LBB5_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -1937,43 +2937,67 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s12, s8
-; GFX1064-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_mov_b32 s13, s9
-; GFX1064-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-NEXT: s_mov_b32 s14, s10
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b32 s14, s10
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB5_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB5_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB5_4
+; GFX1064-NEXT: .LBB5_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -1983,43 +3007,63 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s12, s8
-; GFX1032-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_mov_b32 s13, s9
-; GFX1032-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-NEXT: s_mov_b32 s14, s10
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b32 s14, s10
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
+; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB5_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB5_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB5_4
+; GFX1032-NEXT: .LBB5_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -2134,11 +3178,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-DPP-NEXT: s_mov_b32 s12, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX9-DPP-NEXT: s_mov_b32 s13, s9
-; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[4:5]
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -2153,23 +3197,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX9-DPP-NEXT: .LBB5_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -2179,43 +3262,80 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1064-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1064-DPP-NEXT: .LBB5_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -2225,43 +3345,74 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1032-DPP-NEXT: .LBB5_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
@@ -2465,11 +3616,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s11
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_add_u32 s8, s4, 44
+; GFX9-NEXT: s_add_u32 s8, s34, 44
; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -2484,23 +3635,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b32 s2, s1
+; GFX9-NEXT: s_ff1_i32_b32 s3, s0
+; GFX9-NEXT: s_add_i32 s2, s2, 32
+; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB6_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT: .LBB6_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB6_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB6_4
+; GFX9-NEXT: .LBB6_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2510,43 +3685,67 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s12, s8
-; GFX1064-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_mov_b32 s13, s9
-; GFX1064-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-NEXT: s_mov_b32 s14, s10
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b32 s14, s10
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB6_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT: .LBB6_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB6_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB6_4
+; GFX1064-NEXT: .LBB6_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2556,43 +3755,63 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s12, s8
-; GFX1032-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_mov_b32 s13, s9
-; GFX1032-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-NEXT: s_mov_b32 s14, s10
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b32 s14, s10
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
+; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB6_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT: .LBB6_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB6_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB6_4
+; GFX1032-NEXT: .LBB6_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2707,11 +3926,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-DPP-NEXT: s_mov_b32 s12, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX9-DPP-NEXT: s_mov_b32 s13, s9
-; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[4:5]
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -2726,23 +3945,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX9-DPP-NEXT: .LBB6_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2752,43 +4010,80 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1064-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1064-DPP-NEXT: .LBB6_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2798,43 +4093,74 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1032-DPP-NEXT: .LBB6_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2980,256 +4306,466 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 {
; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s10, -1
+; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s8, s8, s3
+; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2
+; GFX7LESS-NEXT: .LBB7_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-NEXT: s_add_u32 s8, s8, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB7_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
+; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB7_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_cbranch_execnz .LBB7_2
+; GFX9-NEXT: .LBB7_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-NEXT: s_mov_b32 s2, 0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB7_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1064-NEXT: .LBB7_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB7_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1032-NEXT: .LBB7_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_mov_b32 s2, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB7_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1164-NEXT: .LBB7_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT: s_cbranch_execz .LBB7_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1132-NEXT: .LBB7_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s10, -1
+; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1]
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v3
+; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1064-DPP-NEXT: .LBB7_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1032-DPP-NEXT: .LBB7_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1164-DPP-NEXT: .LBB7_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1132-DPP-NEXT: .LBB7_3:
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
ret void
@@ -3295,11 +4831,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s11
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_add_u32 s8, s4, 44
+; GFX9-NEXT: s_add_u32 s8, s34, 44
; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -3314,23 +4850,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b32 s2, s1
+; GFX9-NEXT: s_ff1_i32_b32 s3, s0
+; GFX9-NEXT: s_add_i32 s2, s2, 32
+; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB8_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB8_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB8_4
+; GFX9-NEXT: .LBB8_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
@@ -3340,43 +4900,67 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s12, s8
-; GFX1064-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_mov_b32 s13, s9
-; GFX1064-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-NEXT: s_mov_b32 s14, s10
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b32 s14, s10
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB8_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB8_4
+; GFX1064-NEXT: .LBB8_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
@@ -3386,94 +4970,142 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s12, s8
-; GFX1032-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_mov_b32 s13, s9
-; GFX1032-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-NEXT: s_mov_b32 s14, s10
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b32 s14, s10
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
+; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB8_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB8_4
+; GFX1032-NEXT: .LBB8_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-NEXT: s_mov_b32 s12, s8
-; GFX1164-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
; GFX1164-NEXT: s_mov_b32 s13, s9
-; GFX1164-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT: s_add_i32 s2, s2, 32
+; GFX1164-NEXT: s_min_u32 s2, s3, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB8_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB8_4
+; GFX1164-NEXT: .LBB8_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-NEXT: s_mov_b32 s13, s14
@@ -3481,21 +5113,46 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
+; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB8_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB8_4
+; GFX1132-NEXT: .LBB8_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
@@ -3506,11 +5163,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-DPP-NEXT: s_mov_b32 s12, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX9-DPP-NEXT: s_mov_b32 s13, s9
-; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[4:5]
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -3525,23 +5182,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX9-DPP-NEXT: .LBB8_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
@@ -3551,43 +5247,80 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1064-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1064-DPP-NEXT: .LBB8_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
@@ -3597,94 +5330,171 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
+; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1032-DPP-NEXT: .LBB8_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT: v_add_f32_e32 v4, v5, v0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1164-DPP-NEXT: .LBB8_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
@@ -3692,21 +5502,60 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
+; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT: v_add_f32_e32 v4, v5, v0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1132-DPP-NEXT: .LBB8_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue monotonic, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index ffae7980a9d498..55a3b877181fe9 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -17,256 +17,368 @@ declare float @div.float.value()
define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3
+; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f32_e32 v0, -4.0, v1
+; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2
+; GFX7LESS-NEXT: .LBB0_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB0_3
+; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_add_f32_e32 v0, -4.0, v1
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB0_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_cbranch_execnz .LBB0_2
+; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB0_3
+; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_add_f32_e32 v0, -4.0, v1
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB0_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
+; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB0_3
+; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_add_f32_e32 v0, -4.0, v1
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB0_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
+; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_cbranch_execz .LBB0_3
+; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_f32_e32 v0, -4.0, v1
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB0_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
+; GFX1164-NEXT: .LBB0_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_cbranch_execz .LBB0_3
+; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_f32_e32 v0, -4.0, v1
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB0_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
+; GFX1132-NEXT: .LBB0_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
+; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
+; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
+; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
+; GFX1164-DPP-NEXT: .LBB0_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
+; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
+; GFX1132-DPP-NEXT: .LBB0_3:
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
ret void
@@ -332,11 +444,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s11
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_add_u32 s8, s4, 44
+; GFX9-NEXT: s_add_u32 s8, s34, 44
; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -351,23 +463,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b32 s2, s1
+; GFX9-NEXT: s_ff1_i32_b32 s3, s0
+; GFX9-NEXT: s_add_i32 s2, s2, 32
+; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB1_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB1_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB1_4
+; GFX9-NEXT: .LBB1_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
@@ -377,43 +513,67 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s12, s8
-; GFX1064-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_mov_b32 s13, s9
-; GFX1064-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-NEXT: s_mov_b32 s14, s10
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b32 s14, s10
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB1_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB1_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB1_4
+; GFX1064-NEXT: .LBB1_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
@@ -423,94 +583,142 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s12, s8
-; GFX1032-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_mov_b32 s13, s9
-; GFX1032-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-NEXT: s_mov_b32 s14, s10
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b32 s14, s10
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_subrev_f32_e32 v2, s2, v2
+; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB1_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB1_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB1_4
+; GFX1032-NEXT: .LBB1_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-NEXT: s_mov_b32 s12, s8
-; GFX1164-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
; GFX1164-NEXT: s_mov_b32 s13, s9
-; GFX1164-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT: s_add_i32 s2, s2, 32
+; GFX1164-NEXT: s_min_u32 s2, s3, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB1_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB1_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB1_4
+; GFX1164-NEXT: .LBB1_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-NEXT: s_mov_b32 s13, s14
@@ -518,21 +726,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_subrev_f32_e32 v2, s2, v2
+; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB1_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB1_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB1_4
+; GFX1132-NEXT: .LBB1_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
@@ -543,11 +776,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-DPP-NEXT: s_mov_b32 s12, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX9-DPP-NEXT: s_mov_b32 s13, s9
-; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[4:5]
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -562,23 +795,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2
+; GFX9-DPP-NEXT: .LBB1_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
@@ -588,43 +851,70 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1064-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_sub_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v5
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2
+; GFX1064-DPP-NEXT: .LBB1_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
@@ -634,94 +924,148 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v5
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2
+; GFX1032-DPP-NEXT: .LBB1_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v5
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2
+; GFX1164-DPP-NEXT: .LBB1_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
@@ -729,21 +1073,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v5
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2
+; GFX1132-DPP-NEXT: .LBB1_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
@@ -753,256 +1123,466 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s10, -1
+; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s8, s8, s3
+; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_subrev_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2
+; GFX7LESS-NEXT: .LBB2_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-NEXT: s_add_u32 s8, s8, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB2_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: v_sub_f32_e32 v1, v2, v3
+; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB2_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_cbranch_execnz .LBB2_2
+; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-NEXT: s_mov_b32 s2, 0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB2_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB2_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_mov_b32 s2, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB2_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT: s_cbranch_execz .LBB2_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s10, -1
+; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1]
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v3
+; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
+; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
ret void
@@ -1069,11 +1649,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s11
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_add_u32 s8, s4, 44
+; GFX9-NEXT: s_add_u32 s8, s34, 44
; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -1088,23 +1668,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b32 s2, s1
+; GFX9-NEXT: s_ff1_i32_b32 s3, s0
+; GFX9-NEXT: s_add_i32 s2, s2, 32
+; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB3_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB3_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB3_4
+; GFX9-NEXT: .LBB3_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1114,43 +1718,67 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s12, s8
-; GFX1064-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_mov_b32 s13, s9
-; GFX1064-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-NEXT: s_mov_b32 s14, s10
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b32 s14, s10
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB3_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB3_4
+; GFX1064-NEXT: .LBB3_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1160,94 +1788,142 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s12, s8
-; GFX1032-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_mov_b32 s13, s9
-; GFX1032-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-NEXT: s_mov_b32 s14, s10
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b32 s14, s10
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_subrev_f32_e32 v2, s2, v2
+; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB3_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB3_4
+; GFX1032-NEXT: .LBB3_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-NEXT: s_mov_b32 s12, s8
-; GFX1164-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
; GFX1164-NEXT: s_mov_b32 s13, s9
-; GFX1164-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT: s_add_i32 s2, s2, 32
+; GFX1164-NEXT: s_min_u32 s2, s3, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB3_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB3_4
+; GFX1164-NEXT: .LBB3_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-NEXT: s_mov_b32 s13, s14
@@ -1255,21 +1931,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_subrev_f32_e32 v2, s2, v2
+; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB3_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB3_4
+; GFX1132-NEXT: .LBB3_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1280,11 +1981,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-DPP-NEXT: s_mov_b32 s12, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX9-DPP-NEXT: s_mov_b32 s13, s9
-; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[4:5]
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -1299,23 +2000,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2
+; GFX9-DPP-NEXT: .LBB3_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1325,43 +2056,70 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1064-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_sub_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v5
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2
+; GFX1064-DPP-NEXT: .LBB3_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
@@ -1371,94 +2129,148 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v5
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2
+; GFX1032-DPP-NEXT: .LBB3_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v5
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2
+; GFX1164-DPP-NEXT: .LBB3_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
@@ -1466,21 +2278,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v5
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2
+; GFX1132-DPP-NEXT: .LBB3_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
@@ -1490,256 +2328,466 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{
; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s10, -1
+; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s8, s8, s3
+; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_subrev_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2
+; GFX7LESS-NEXT: .LBB4_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-NEXT: s_add_u32 s8, s8, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB4_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: v_sub_f32_e32 v1, v2, v3
+; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB4_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_cbranch_execnz .LBB4_2
+; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-NEXT: s_mov_b32 s2, 0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB4_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB4_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_mov_b32 s2, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB4_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT: s_cbranch_execz .LBB4_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s10, -1
+; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1]
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v3
+; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: .LBB4_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic
ret void
@@ -1806,11 +2854,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s11
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_add_u32 s8, s4, 44
+; GFX9-NEXT: s_add_u32 s8, s34, 44
; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -1825,23 +2873,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b32 s2, s1
+; GFX9-NEXT: s_ff1_i32_b32 s3, s0
+; GFX9-NEXT: s_add_i32 s2, s2, 32
+; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB5_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB5_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB5_4
+; GFX9-NEXT: .LBB5_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
@@ -1851,43 +2923,67 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s12, s8
-; GFX1064-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_mov_b32 s13, s9
-; GFX1064-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-NEXT: s_mov_b32 s14, s10
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b32 s14, s10
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB5_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB5_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB5_4
+; GFX1064-NEXT: .LBB5_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
@@ -1897,94 +2993,142 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s12, s8
-; GFX1032-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_mov_b32 s13, s9
-; GFX1032-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-NEXT: s_mov_b32 s14, s10
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b32 s14, s10
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_subrev_f32_e32 v2, s2, v2
+; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB5_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB5_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB5_4
+; GFX1032-NEXT: .LBB5_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-NEXT: s_mov_b32 s12, s8
-; GFX1164-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
; GFX1164-NEXT: s_mov_b32 s13, s9
-; GFX1164-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT: s_add_i32 s2, s2, 32
+; GFX1164-NEXT: s_min_u32 s2, s3, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB5_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB5_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB5_4
+; GFX1164-NEXT: .LBB5_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-NEXT: s_mov_b32 s13, s14
@@ -1992,21 +3136,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_subrev_f32_e32 v2, s2, v2
+; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB5_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB5_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB5_4
+; GFX1132-NEXT: .LBB5_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
@@ -2017,11 +3186,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-DPP-NEXT: s_mov_b32 s12, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX9-DPP-NEXT: s_mov_b32 s13, s9
-; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[4:5]
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -2036,23 +3205,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX9-DPP-NEXT: .LBB5_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
@@ -2062,43 +3261,70 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1064-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_sub_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v5
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1064-DPP-NEXT: .LBB5_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
@@ -2108,94 +3334,148 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v5
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1032-DPP-NEXT: .LBB5_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v5
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1164-DPP-NEXT: .LBB5_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
@@ -2203,21 +3483,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v5
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1132-DPP-NEXT: .LBB5_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
@@ -2285,11 +3591,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s11
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_add_u32 s8, s4, 44
+; GFX9-NEXT: s_add_u32 s8, s34, 44
; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -2304,23 +3610,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b32 s2, s1
+; GFX9-NEXT: s_ff1_i32_b32 s3, s0
+; GFX9-NEXT: s_add_i32 s2, s2, 32
+; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB6_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT: .LBB6_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB6_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB6_4
+; GFX9-NEXT: .LBB6_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2330,43 +3660,67 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s12, s8
-; GFX1064-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_mov_b32 s13, s9
-; GFX1064-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-NEXT: s_mov_b32 s14, s10
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b32 s14, s10
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB6_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT: .LBB6_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB6_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB6_4
+; GFX1064-NEXT: .LBB6_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2376,94 +3730,142 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s12, s8
-; GFX1032-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_mov_b32 s13, s9
-; GFX1032-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-NEXT: s_mov_b32 s14, s10
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b32 s14, s10
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_subrev_f32_e32 v2, s2, v2
+; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB6_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT: .LBB6_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB6_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB6_4
+; GFX1032-NEXT: .LBB6_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-NEXT: s_mov_b32 s12, s8
-; GFX1164-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
; GFX1164-NEXT: s_mov_b32 s13, s9
-; GFX1164-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT: s_add_i32 s2, s2, 32
+; GFX1164-NEXT: s_min_u32 s2, s3, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB6_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT: .LBB6_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB6_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB6_4
+; GFX1164-NEXT: .LBB6_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-NEXT: s_mov_b32 s13, s14
@@ -2471,21 +3873,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB6_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_subrev_f32_e32 v2, s2, v2
+; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB6_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT: .LBB6_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB6_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB6_4
+; GFX1132-NEXT: .LBB6_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2496,11 +3923,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-DPP-NEXT: s_mov_b32 s12, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX9-DPP-NEXT: s_mov_b32 s13, s9
-; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[4:5]
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -2515,23 +3942,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX9-DPP-NEXT: .LBB6_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2541,43 +3998,70 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1064-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_sub_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v5
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1064-DPP-NEXT: .LBB6_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
@@ -2587,94 +4071,148 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v5
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1032-DPP-NEXT: .LBB6_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v5
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1164-DPP-NEXT: .LBB6_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
@@ -2682,21 +4220,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v5
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1132-DPP-NEXT: .LBB6_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
@@ -2706,256 +4270,466 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 {
; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s10, -1
+; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s8, s8, s3
+; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_subrev_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1
-; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2
+; GFX7LESS-NEXT: .LBB7_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-NEXT: s_add_u32 s8, s8, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB7_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: v_sub_f32_e32 v1, v2, v3
+; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB7_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_cbranch_execnz .LBB7_2
+; GFX9-NEXT: .LBB7_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-NEXT: s_mov_b32 s2, 0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB7_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1064-NEXT: .LBB7_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB7_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1032-NEXT: .LBB7_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_mov_b32 s2, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB7_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1164-NEXT: .LBB7_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT: s_cbranch_execz .LBB7_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1132-NEXT: .LBB7_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s10, -1
+; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1]
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v3
+; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1064-DPP-NEXT: .LBB7_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1032-DPP-NEXT: .LBB7_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1164-DPP-NEXT: .LBB7_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1132-DPP-NEXT: .LBB7_3:
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
ret void
@@ -3021,11 +4795,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s11
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_add_u32 s8, s4, 44
+; GFX9-NEXT: s_add_u32 s8, s34, 44
; GFX9-NEXT: s_mov_b32 s13, s9
-; GFX9-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -3040,23 +4814,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b32 s2, s1
+; GFX9-NEXT: s_ff1_i32_b32 s3, s0
+; GFX9-NEXT: s_add_i32 s2, s2, 32
+; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execz .LBB8_5
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX9-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB8_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB8_4
+; GFX9-NEXT: .LBB8_5:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
@@ -3066,43 +4864,67 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX1064-NEXT: s_mov_b32 s38, -1
; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-NEXT: s_mov_b32 s12, s8
-; GFX1064-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_mov_b32 s13, s9
-; GFX1064-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-NEXT: s_mov_b32 s14, s10
-; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b32 s14, s10
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-NEXT: s_mov_b32 s32, 0
-; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
+; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
+; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB8_5
+; GFX1064-NEXT: ; %bb.3:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1064-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB8_4
+; GFX1064-NEXT: .LBB8_5:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
@@ -3112,94 +4934,142 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX1032-NEXT: s_mov_b32 s38, -1
; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-NEXT: s_mov_b32 s12, s8
-; GFX1032-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
; GFX1032-NEXT: s_mov_b32 s13, s9
-; GFX1032-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-NEXT: s_mov_b32 s14, s10
-; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b32 s14, s10
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-NEXT: s_mov_b32 s32, 0
-; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
+; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032-NEXT: v_subrev_f32_e32 v2, s2, v2
+; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execz .LBB8_5
+; GFX1032-NEXT: ; %bb.3:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
+; GFX1032-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB8_4
+; GFX1032-NEXT: .LBB8_5:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-NEXT: s_mov_b32 s12, s8
-; GFX1164-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
; GFX1164-NEXT: s_mov_b32 s13, s9
-; GFX1164-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-NEXT: s_mov_b32 s32, 0
-; GFX1164-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1164-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
+; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
+; GFX1164-NEXT: s_add_i32 s2, s2, 32
+; GFX1164-NEXT: s_min_u32 s2, s3, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execz .LBB8_5
+; GFX1164-NEXT: ; %bb.3:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1164-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB8_4
+; GFX1164-NEXT: .LBB8_5:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-NEXT: s_mov_b32 s13, s14
@@ -3207,21 +5077,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1132-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
+; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_subrev_f32_e32 v2, s2, v2
+; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execz .LBB8_5
+; GFX1132-NEXT: ; %bb.3:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
+; GFX1132-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB8_4
+; GFX1132-NEXT: .LBB8_5:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
@@ -3232,11 +5127,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11
; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX9-DPP-NEXT: s_mov_b32 s12, s8
-; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX9-DPP-NEXT: s_mov_b32 s13, s9
-; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX9-DPP-NEXT: s_getpc_b64 s[4:5]
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
@@ -3251,23 +5146,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT: s_not_b64 exec, exec
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: s_nop 0
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: s_nop 1
+; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
+; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_1
-; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX9-DPP-NEXT: .LBB8_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
@@ -3277,43 +5202,70 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1064-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1064-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT: s_not_b64 exec, exec
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_sub_f32_e64 v3, s2, s3
+; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v5
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1064-DPP-NEXT: .LBB8_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
@@ -3323,94 +5275,148 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
; GFX1032-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1032-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v5
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1032-DPP-NEXT: .LBB8_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
; GFX1164-DPP-NEXT: s_mov_b32 s12, s8
-; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
-; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: s_not_b64 exec, exec
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v5
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1164-DPP-NEXT: .LBB8_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44
-; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0
-; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7]
-; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12
-; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0
-; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
@@ -3418,21 +5424,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35]
-; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v5
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_1
-; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1132-DPP-NEXT: .LBB8_3:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue monotonic, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index cf986e7d314a20..84f67b3faac3c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -186,6 +186,10 @@
; GCN-O1-NEXT: Cycle Info Analysis
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Infer address spaces
+; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: Cycle Info Analysis
+; GCN-O1-NEXT: Uniformity Analysis
+; GCN-O1-NEXT: AMDGPU atomic optimizations
; GCN-O1-NEXT: Expand Atomic instructions
; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Dominator Tree Construction
@@ -241,14 +245,11 @@
; GCN-O1-NEXT: Cycle Info Analysis
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU IR late optimizations
-; GCN-O1-NEXT: AMDGPU atomic optimizations
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-NEXT: Function Alias Analysis Results
; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: Code sinking
; GCN-O1-NEXT: Post-Dominator Tree Construction
-; GCN-O1-NEXT: Cycle Info Analysis
-; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: Unify divergent function exit nodes
; GCN-O1-NEXT: Lazy Value Information Analysis
; GCN-O1-NEXT: Lower SwitchInst's to branches
@@ -457,6 +458,10 @@
; GCN-O1-OPTS-NEXT: Cycle Info Analysis
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Infer address spaces
+; GCN-O1-OPTS-NEXT: Dominator Tree Construction
+; GCN-O1-OPTS-NEXT: Cycle Info Analysis
+; GCN-O1-OPTS-NEXT: Uniformity Analysis
+; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations
; GCN-O1-OPTS-NEXT: Expand Atomic instructions
; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
@@ -526,14 +531,11 @@
; GCN-O1-OPTS-NEXT: Cycle Info Analysis
; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations
-; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations
; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: Code sinking
; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction
-; GCN-O1-OPTS-NEXT: Cycle Info Analysis
-; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: Unify divergent function exit nodes
; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis
; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches
@@ -750,6 +752,10 @@
; GCN-O2-NEXT: Cycle Info Analysis
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Infer address spaces
+; GCN-O2-NEXT: Dominator Tree Construction
+; GCN-O2-NEXT: Cycle Info Analysis
+; GCN-O2-NEXT: Uniformity Analysis
+; GCN-O2-NEXT: AMDGPU atomic optimizations
; GCN-O2-NEXT: Expand Atomic instructions
; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Dominator Tree Construction
@@ -827,14 +833,11 @@
; GCN-O2-NEXT: Cycle Info Analysis
; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: AMDGPU IR late optimizations
-; GCN-O2-NEXT: AMDGPU atomic optimizations
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O2-NEXT: Function Alias Analysis Results
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: Code sinking
; GCN-O2-NEXT: Post-Dominator Tree Construction
-; GCN-O2-NEXT: Cycle Info Analysis
-; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: Unify divergent function exit nodes
; GCN-O2-NEXT: Lazy Value Information Analysis
; GCN-O2-NEXT: Lower SwitchInst's to branches
@@ -1053,6 +1056,10 @@
; GCN-O3-NEXT: Cycle Info Analysis
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Infer address spaces
+; GCN-O3-NEXT: Dominator Tree Construction
+; GCN-O3-NEXT: Cycle Info Analysis
+; GCN-O3-NEXT: Uniformity Analysis
+; GCN-O3-NEXT: AMDGPU atomic optimizations
; GCN-O3-NEXT: Expand Atomic instructions
; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Dominator Tree Construction
@@ -1142,14 +1149,11 @@
; GCN-O3-NEXT: Cycle Info Analysis
; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: AMDGPU IR late optimizations
-; GCN-O3-NEXT: AMDGPU atomic optimizations
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O3-NEXT: Function Alias Analysis Results
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: Code sinking
; GCN-O3-NEXT: Post-Dominator Tree Construction
-; GCN-O3-NEXT: Cycle Info Analysis
-; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: Unify divergent function exit nodes
; GCN-O3-NEXT: Lazy Value Information Analysis
; GCN-O3-NEXT: Lower SwitchInst's to branches
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 039fc445aa590f..3641bd4ef865dc 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -319,53 +319,78 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX7-LABEL: lds_ds_fadd:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
+; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_lshl_b32 s4, s3, 3
-; GFX7-NEXT: s_add_i32 s4, s4, 32
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_add_i32 s3, s3, 4
-; GFX7-NEXT: s_lshl_b32 s6, s3, 3
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX7-NEXT: ; implicit-def: $vgpr1
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT: s_cbranch_execz .LBB2_4
+; GFX7-NEXT: ; %bb.1:
+; GFX7-NEXT: s_lshl_b32 s8, s3, 3
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: ds_read_b32 v1, v1
+; GFX7-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s6
+; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, v0
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_add_f32_e32 v4, v3, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB2_2
+; GFX7-NEXT: ; %bb.3: ; %Flow15
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: .LBB2_4: ; %Flow16
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: v_readfirstlane_b32 s8, v1
+; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s6, 0
+; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s7, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT: s_cbranch_execz .LBB2_7
+; GFX7-NEXT: ; %bb.5:
; GFX7-NEXT: s_lshl_b32 s3, s3, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: ds_read_b32 v1, v1
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB2_3: ; %atomicrmw.start2
+; GFX7-NEXT: ds_read_b32 v2, v1
+; GFX7-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s6
+; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: .LBB2_6: ; %atomicrmw.start2
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
+; GFX7-NEXT: v_add_f32_e32 v3, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_3
-; GFX7-NEXT: ; %bb.4: ; %atomicrmw.end1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, v3
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB2_6
+; GFX7-NEXT: .LBB2_7: ; %Flow14
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: ds_read_b32 v1, v1
+; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
+; GFX7-NEXT: v_add_f32_e32 v0, s8, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB2_5: ; %atomicrmw.start8
+; GFX7-NEXT: .LBB2_8: ; %atomicrmw.start8
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, v1
@@ -377,8 +402,8 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB2_5
-; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end7
+; GFX7-NEXT: s_cbranch_execnz .LBB2_8
+; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -390,52 +415,78 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX8-LABEL: lds_ds_fadd:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshl_b32 s4, s3, 3
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: ds_read_b32 v0, v0 offset:32
; GFX8-NEXT: s_add_i32 s3, s3, 4
-; GFX8-NEXT: s_lshl_b32 s6, s3, 3
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start
+; GFX8-NEXT: ; implicit-def: $vgpr1
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_cbranch_execz .LBB2_4
+; GFX8-NEXT: ; %bb.1:
+; GFX8-NEXT: s_lshl_b32 s8, s3, 3
+; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: ds_read_b32 v1, v1
+; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, s6
+; GFX8-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_add_f32_e32 v2, 0x42280000, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: v_add_f32_e32 v4, v3, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB2_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB2_2
+; GFX8-NEXT: ; %bb.3: ; %Flow17
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: .LBB2_4: ; %Flow18
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: v_readfirstlane_b32 s8, v1
+; GFX8-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s6, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s7, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_cbranch_execz .LBB2_7
+; GFX8-NEXT: ; %bb.5:
; GFX8-NEXT: s_lshl_b32 s3, s3, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: ds_read_b32 v1, v1
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB2_3: ; %atomicrmw.start2
+; GFX8-NEXT: ds_read_b32 v2, v1
+; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s6
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: .LBB2_6: ; %atomicrmw.start2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f32_e32 v2, 0x42280000, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_f32_e32 v3, v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB2_3
-; GFX8-NEXT: ; %bb.4: ; %atomicrmw.end1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB2_6
+; GFX8-NEXT: .LBB2_7: ; %Flow16
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: ds_read_b32 v1, v1
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
+; GFX8-NEXT: v_add_f32_e32 v0, s8, v0
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB2_5: ; %atomicrmw.start8
+; GFX8-NEXT: .LBB2_8: ; %atomicrmw.start8
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, v1
@@ -447,8 +498,8 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB2_5
-; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end7
+; GFX8-NEXT: s_cbranch_execnz .LBB2_8
+; GFX8-NEXT: ; %bb.9: ; %atomicrmw.end7
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX8-NEXT: s_mov_b32 s3, 0xf000
@@ -635,51 +686,76 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa
; GFX7-LABEL: lds_ds_fadd_one_as:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
+; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_lshl_b32 s4, s3, 3
-; GFX7-NEXT: s_add_i32 s4, s4, 32
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_add_i32 s3, s3, 4
-; GFX7-NEXT: s_lshl_b32 s6, s3, 3
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX7-NEXT: ; implicit-def: $vgpr1
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT: s_cbranch_execz .LBB3_4
+; GFX7-NEXT: ; %bb.1:
+; GFX7-NEXT: s_lshl_b32 s8, s3, 3
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: ds_read_b32 v1, v1
+; GFX7-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s6
+; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, v0
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_add_f32_e32 v4, v3, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB3_2
+; GFX7-NEXT: ; %bb.3: ; %Flow15
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: .LBB3_4: ; %Flow16
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: v_readfirstlane_b32 s8, v1
+; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s6, 0
+; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s7, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT: s_cbranch_execz .LBB3_7
+; GFX7-NEXT: ; %bb.5:
; GFX7-NEXT: s_lshl_b32 s3, s3, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: ds_read_b32 v1, v1
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB3_3: ; %atomicrmw.start2
+; GFX7-NEXT: ds_read_b32 v2, v1
+; GFX7-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s6
+; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: .LBB3_6: ; %atomicrmw.start2
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2
+; GFX7-NEXT: v_add_f32_e32 v3, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, s3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_3
-; GFX7-NEXT: ; %bb.4: ; %atomicrmw.end1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, v3
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB3_6
+; GFX7-NEXT: .LBB3_7: ; %Flow14
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: ds_read_b32 v1, v1
+; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
+; GFX7-NEXT: v_add_f32_e32 v0, s8, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB3_5: ; %atomicrmw.start8
+; GFX7-NEXT: .LBB3_8: ; %atomicrmw.start8
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, v1
@@ -690,8 +766,8 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB3_5
-; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end7
+; GFX7-NEXT: s_cbranch_execnz .LBB3_8
+; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -703,50 +779,76 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa
; GFX8-LABEL: lds_ds_fadd_one_as:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshl_b32 s4, s3, 3
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: ds_read_b32 v0, v0 offset:32
; GFX8-NEXT: s_add_i32 s3, s3, 4
-; GFX8-NEXT: s_lshl_b32 s6, s3, 3
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX8-NEXT: ; implicit-def: $vgpr1
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_cbranch_execz .LBB3_4
+; GFX8-NEXT: ; %bb.1:
+; GFX8-NEXT: s_lshl_b32 s8, s3, 3
+; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: ds_read_b32 v1, v1
+; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, s6
+; GFX8-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: .LBB3_2: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_add_f32_e32 v2, 0x42280000, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: v_add_f32_e32 v4, v3, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB3_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB3_2
+; GFX8-NEXT: ; %bb.3: ; %Flow17
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: .LBB3_4: ; %Flow18
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: v_readfirstlane_b32 s8, v1
+; GFX8-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s6, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s7, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_cbranch_execz .LBB3_7
+; GFX8-NEXT: ; %bb.5:
; GFX8-NEXT: s_lshl_b32 s3, s3, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: ds_read_b32 v1, v1
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB3_3: ; %atomicrmw.start2
+; GFX8-NEXT: ds_read_b32 v2, v1
+; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s6
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: .LBB3_6: ; %atomicrmw.start2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f32_e32 v2, 0x42280000, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2
+; GFX8-NEXT: v_add_f32_e32 v3, v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB3_3
-; GFX8-NEXT: ; %bb.4: ; %atomicrmw.end1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB3_6
+; GFX8-NEXT: .LBB3_7: ; %Flow16
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: ds_read_b32 v1, v1
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0
+; GFX8-NEXT: v_add_f32_e32 v0, s8, v0
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB3_5: ; %atomicrmw.start8
+; GFX8-NEXT: .LBB3_8: ; %atomicrmw.start8
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, v1
@@ -757,8 +859,8 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB3_5
-; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end7
+; GFX8-NEXT: s_cbranch_execnz .LBB3_8
+; GFX8-NEXT: ; %bb.9: ; %atomicrmw.end7
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX8-NEXT: s_mov_b32 s3, 0xf000
More information about the llvm-commits
mailing list